Index: cmake/modules/HandleLLVMOptions.cmake =================================================================== --- cmake/modules/HandleLLVMOptions.cmake +++ cmake/modules/HandleLLVMOptions.cmake @@ -115,7 +115,7 @@ set(LLVM_ON_UNIX 0) endif(CYGWIN) else(WIN32) - if(FUCHSIA OR UNIX) + if(UNIX) set(LLVM_ON_WIN32 0) set(LLVM_ON_UNIX 1) if(APPLE OR ${CMAKE_SYSTEM_NAME} MATCHES "AIX") @@ -123,9 +123,9 @@ else() set(LLVM_HAVE_LINK_VERSION_SCRIPT 1) endif() - else(FUCHSIA OR UNIX) + else(UNIX) MESSAGE(SEND_ERROR "Unable to determine platform") - endif(FUCHSIA OR UNIX) + endif(UNIX) endif(WIN32) set(EXEEXT ${CMAKE_EXECUTABLE_SUFFIX}) Index: docs/LangRef.rst =================================================================== --- docs/LangRef.rst +++ docs/LangRef.rst @@ -5318,7 +5318,7 @@ The existence of the ``invariant.group`` metadata on the instruction tells the optimizer that every ``load`` and ``store`` to the same pointer operand within the same invariant group can be assumed to load or store the same -value (but see the ``llvm.launder.invariant.group`` intrinsic which affects +value (but see the ``llvm.invariant.group.barrier`` intrinsic which affects when two pointers are considered the same). Pointers returned by bitcast or getelementptr with only zero indices are considered the same. @@ -5343,13 +5343,13 @@ store i8 %unknownValue, i8* %ptr, !invariant.group !0 ; Can assume that %unknownValue == 42 call void @foo(i8* %ptr) - %newPtr2 = call i8* @llvm.launder.invariant.group(i8* %ptr) - %d = load i8, i8* %newPtr2, !invariant.group !0 ; Can't step through launder.invariant.group to get value of %ptr + %newPtr2 = call i8* @llvm.invariant.group.barrier(i8* %ptr) + %d = load i8, i8* %newPtr2, !invariant.group !0 ; Can't step through invariant.group.barrier to get value of %ptr ... declare void @foo(i8*) declare i8* @getPointer(i8*) - declare i8* @llvm.launder.invariant.group(i8*) + declare i8* @llvm.invariant.group.barrier(i8*) !0 = !{!"magic ptr"} !1 = !{!"other ptr"} @@ -12908,7 +12908,7 @@ This intrinsic indicates that the memory is mutable again. -'``llvm.launder.invariant.group``' Intrinsic +'``llvm.invariant.group.barrier``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -12919,12 +12919,12 @@ :: - declare i8* @llvm.launder.invariant.group.p0i8(i8* ) + declare i8* @llvm.invariant.group.barrier.p0i8(i8* ) Overview: """"""""" -The '``llvm.launder.invariant.group``' intrinsic can be used when an invariant +The '``llvm.invariant.group.barrier``' intrinsic can be used when an invariant established by invariant.group metadata no longer holds, to obtain a new pointer value that does not carry the invariant information. It is an experimental intrinsic, which means that its semantics might change in the future. @@ -12933,7 +12933,7 @@ Arguments: """""""""" -The ``llvm.launder.invariant.group`` takes only one argument, which is +The ``llvm.invariant.group.barrier`` takes only one argument, which is the pointer to the memory for which the ``invariant.group`` no longer holds. Semantics: @@ -12941,7 +12941,6 @@ Returns another pointer that aliases its argument but which is considered different for the purposes of ``load``/``store`` ``invariant.group`` metadata. -It does not read any accessible memory and the execution can be speculated. .. _constrainedfp: Index: docs/ReleaseNotes.rst =================================================================== --- docs/ReleaseNotes.rst +++ docs/ReleaseNotes.rst @@ -107,8 +107,6 @@ have changed. 
Alignment is no longer an argument, and are instead conveyed as parameter attributes. -* invariant.group.barrier has been renamed to launder.invariant.group. - Changes to the ARM Backend -------------------------- Index: docs/XRay.rst =================================================================== --- docs/XRay.rst +++ docs/XRay.rst @@ -171,6 +171,20 @@ | xray_logfile_base | ``const char*`` | ``xray-log.`` | Filename base for the | | | | | XRay logfile. | +-------------------+-----------------+---------------+------------------------+ +| xray_naive_log | ``bool`` | ``false`` | **DEPRECATED:** Use | +| | | | xray_mode=xray-basic | +| | | | instead. Whether to | +| | | | install the basic log | +| | | | the naive log | +| | | | implementation. | ++-------------------+-----------------+---------------+------------------------+ +| xray_fdr_log | ``bool`` | ``false`` | **DEPRECATED:** Use | +| | | | xray_mode=xray-fdr | +| | | | instead. Whether to | +| | | | install the Flight | +| | | | Data Recorder | +| | | | (FDR) mode. | ++-------------------+-----------------+---------------+------------------------+ | verbosity | ``int`` | ``0`` | Runtime verbosity | | | | | level. | +-------------------+-----------------+---------------+------------------------+ @@ -179,45 +193,30 @@ If you choose to not use the default logging implementation that comes with the XRay runtime and/or control when/how the XRay instrumentation runs, you may use the XRay APIs directly for doing so. To do this, you'll need to include the -``xray_log_interface.h`` from the compiler-rt ``xray`` directory. The important API +``xray_interface.h`` from the compiler-rt ``xray`` directory. The important API functions we list below: -- ``__xray_log_register_mode(...)``: Register a logging implementation against - a string Mode identifier. The implementation is an instance of - ``XRayLogImpl`` defined in ``xray/xray_log_interface.h``. -- ``__xray_log_select_mode(...)``: Select the mode to install, associated with - a string Mode identifier. Only implementations registered with - ``__xray_log_register_mode(...)`` can be chosen with this function. -- ``__xray_log_init_mode(...)``: This function allows for initializing and - re-initializing an installed logging implementation. See - ``xray/xray_log_interface.h`` for details, part of the XRay compiler-rt - installation. - -Once a logging implementation has been initialized, it can be "stopped" by -finalizing the implementation through the ``__xray_log_finalize()`` function. -The finalization routine is the opposite of the initialization. When finalized, -an implementation's data can be cleared out through the -``__xray_log_flushLog()`` function. For implementations that support in-memory -processing, these should register an iterator function to provide access to the -data via the ``__xray_log_set_buffer_iterator(...)`` which allows code calling -the ``__xray_log_process_buffers(...)`` function to deal with the data in -memory. - -All of this is better explained in the ``xray/xray_log_interface.h`` header. - -Basic Mode ----------- - -XRay supports a basic logging mode which will trace the application's -execution, and periodically append to a single log. This mode can be -installed/enabled by setting ``xray_mode=xray-basic`` in the ``XRAY_OPTIONS`` -environment variable. Combined with ``patch_premain=true`` this can allow for -tracing applications from start to end. 
- -Like all the other modes installed through ``__xray_log_select_mode(...)``, the -implementation can be configured through the ``__xray_log_init_mode(...)`` -function, providing the mode string and the flag options. Basic-mode specific -defaults can be provided in the ``XRAY_BASIC_OPTIONS`` environment variable. +- ``__xray_set_handler(void (*entry)(int32_t, XRayEntryType))``: Install your + own logging handler for when an event is encountered. See + ``xray/xray_interface.h`` for more details. +- ``__xray_remove_handler()``: Removes whatever the installed handler is. +- ``__xray_patch()``: Patch all the instrumentation points defined in the + binary. +- ``__xray_unpatch()``: Unpatch the instrumentation points defined in the + binary. + +There are some requirements on the logging handler to be installed for the +thread-safety of operations to be performed by the XRay runtime library: + +- The function should be thread-safe, as multiple threads may be invoking the + function at the same time. If the logging function needs to do + synchronisation, it must do so internally as XRay does not provide any + synchronisation guarantees outside from the atomicity of updates to the + pointer. +- The pointer provided to ``__xray_set_handler(...)`` must be live even after + calls to ``__xray_remove_handler()`` and ``__xray_unpatch()`` have succeeded. + XRay cannot guarantee that all threads that have ever gotten a copy of the + pointer will not invoke the function. Flight Data Recorder Mode ------------------------- @@ -227,12 +226,9 @@ very much like a plane's "black box" which keeps recording data to memory in a fixed-size circular queue of buffers, and have the data available programmatically until the buffers are finalized and flushed. To use FDR mode -on your application, you may set the ``xray_mode`` variable to ``xray-fdr`` in -the ``XRAY_OPTIONS`` environment variable. Additional options to the FDR mode -implementation can be provided in the ``XRAY_FDR_OPTIONS`` environment -variable. Programmatic configuration can be done by calling -``__xray_log_init_mode("xray-fdr", )`` once it has been -selected/installed. +on your application, you may set the ``xray_fdr_log`` option to ``true`` in the +``XRAY_OPTIONS`` environment variable (while also optionally setting the +``xray_naive_log`` to ``false``). When the buffers are flushed to disk, the result is a binary trace format described by `XRay FDR format `_ @@ -264,15 +260,34 @@ } The default settings for the FDR mode implementation will create logs named -similarly to the basic log implementation, but will have a different log +similarly to the naive log implementation, but will have a different log format. All the trace analysis tools (and the trace reading library) will support all versions of the FDR mode format as we add more functionality and record types in the future. - **NOTE:** We do not promise perpetual support for when we update the log - versions we support going forward. Deprecation of the formats will be + **NOTE:** We do not however promise perpetual support for when we update the + log versions we support going forward. Deprecation of the formats will be announced and discussed on the developers mailing list. +XRay allows for replacing the default FDR mode logging implementation using the +following API: + +- ``__xray_set_log_impl(...)``: This function takes a struct of type + ``XRayLogImpl``, which is defined in ``xray/xray_log_interface.h``, part of + the XRay compiler-rt installation. 
+- ``__xray_log_register_mode(...)``: Register a logging implementation against + a string Mode. The implementation is an instance of ``XRayLogImpl`` defined + in ``xray/xray_log_interface.h``. +- ``__xray_log_select_mode(...)``: Select the mode to install, associated with + a string Mode. Only implementations registered with + ``__xray_log_register_mode(...)`` can be chosen with this function. When + successful, has the same effects as calling ``__xray_set_log_impl(...)`` with + the registered logging implementation. +- ``__xray_log_init(...)``: This function allows for initializing and + re-initializing an installed logging implementation. See + ``xray/xray_log_interface.h`` for details, part of the XRay compiler-rt + installation. + Trace Analysis Tools -------------------- @@ -286,7 +301,7 @@ options for sorting, and output formats (supports CSV, YAML, and console-friendly TEXT). - ``convert``: Converts an XRay log file from one format to another. We can - convert from binary XRay traces (both basic and FDR mode) to YAML, + convert from binary XRay traces (both naive and FDR mode) to YAML, `flame-graph `_ friendly text formats, as well as `Chrome Trace Viewer (catapult) ` formats. Index: include/llvm/CodeGen/GlobalISel/InstructionSelector.h =================================================================== --- include/llvm/CodeGen/GlobalISel/InstructionSelector.h +++ include/llvm/CodeGen/GlobalISel/InstructionSelector.h @@ -221,7 +221,6 @@ /// Add a temporary register to the specified instruction /// - InsnID - Instruction ID to modify /// - TempRegID - The temporary register ID to add - /// - TempRegFlags - The register flags to set GIR_AddTempRegister, /// Add an immediate to the specified instruction /// - InsnID - Instruction ID to modify @@ -277,8 +276,6 @@ /// Increment the rule coverage counter. /// - RuleID - The ID of the rule that was covered. GIR_Coverage, - - GIU_NumOpcodes, }; enum { @@ -344,10 +341,6 @@ const RegisterBankInfo &RBI, const PredicateBitset &AvailableFeatures, CodeGenCoverage &CoverageInfo) const; - virtual const int64_t *getMatchTable() const { - llvm_unreachable("Should have been overridden by tablegen if used"); - } - virtual bool testImmPredicate_I64(unsigned, int64_t) const { llvm_unreachable("Subclasses must override this to use tablegen"); } Index: include/llvm/CodeGen/MachineInstr.h =================================================================== --- include/llvm/CodeGen/MachineInstr.h +++ include/llvm/CodeGen/MachineInstr.h @@ -80,21 +80,7 @@ FrameDestroy = 1 << 1, // Instruction is used as a part of // function frame destruction code. BundledPred = 1 << 2, // Instruction has bundled predecessors. - BundledSucc = 1 << 3, // Instruction has bundled successors. - FmNoNans = 1 << 4, // Instruction does not support Fast - // math nan values. - FmNoInfs = 1 << 5, // Instruction does not support Fast - // math infinity values. - FmNsz = 1 << 6, // Instruction is not required to retain - // signed zero values. - FmArcp = 1 << 7, // Instruction supports Fast math - // reciprocal approximations. - FmContract = 1 << 8, // Instruction supports Fast math - // contraction operations like fma. - FmAfn = 1 << 9, // Instruction may map to Fast math - // instrinsic approximation. - FmReassoc = 1 << 10 // Instruction supports Fast math - // reassociation of operand order. + BundledSucc = 1 << 3 // Instruction has bundled successors. 
}; private: @@ -107,7 +93,7 @@ using OperandCapacity = ArrayRecycler::Capacity; OperandCapacity CapOperands; // Capacity of the Operands array. - uint16_t Flags = 0; // Various bits of additional + uint8_t Flags = 0; // Various bits of additional // information about machine // instruction. @@ -200,7 +186,7 @@ /// Set a MI flag. void setFlag(MIFlag Flag) { - Flags |= (uint16_t)Flag; + Flags |= (uint8_t)Flag; } void setFlags(unsigned flags) { @@ -211,7 +197,7 @@ /// clearFlag - Clear a MI flag. void clearFlag(MIFlag Flag) { - Flags &= ~((uint16_t)Flag); + Flags &= ~((uint8_t)Flag); } /// Return true if MI is in a bundle (but not the first MI in a bundle). Index: include/llvm/CodeGen/TargetRegisterInfo.h =================================================================== --- include/llvm/CodeGen/TargetRegisterInfo.h +++ include/llvm/CodeGen/TargetRegisterInfo.h @@ -995,12 +995,6 @@ /// of the set as well. bool checkAllSuperRegsMarked(const BitVector &RegisterSet, ArrayRef Exceptions = ArrayRef()) const; - - virtual const TargetRegisterClass * - getConstrainedRegClassForOperand(const MachineOperand &MO, - const MachineRegisterInfo &MRI) const { - return nullptr; - } }; //===----------------------------------------------------------------------===// Index: include/llvm/IR/IRBuilder.h =================================================================== --- include/llvm/IR/IRBuilder.h +++ include/llvm/IR/IRBuilder.h @@ -1476,19 +1476,14 @@ return CreateConstInBoundsGEP2_32(Ty, Ptr, 0, Idx, Name); } - Value *CreateStructGEP(Value *Ptr, unsigned Idx, const Twine &Name = "") { - return CreateConstInBoundsGEP2_32(nullptr, Ptr, 0, Idx, Name); - } - /// Same as CreateGlobalString, but return a pointer with "i8*" type /// instead of a pointer to array of i8. - Constant *CreateGlobalStringPtr(StringRef Str, const Twine &Name = "", - unsigned AddressSpace = 0) { - GlobalVariable *GV = CreateGlobalString(Str, Name, AddressSpace); - Constant *Zero = ConstantInt::get(Type::getInt32Ty(Context), 0); - Constant *Indices[] = {Zero, Zero}; - return ConstantExpr::getInBoundsGetElementPtr(GV->getValueType(), GV, - Indices); + Value *CreateGlobalStringPtr(StringRef Str, const Twine &Name = "", + unsigned AddressSpace = 0) { + GlobalVariable *gv = CreateGlobalString(Str, Name, AddressSpace); + Value *zero = ConstantInt::get(Type::getInt32Ty(Context), 0); + Value *Args[] = { zero, zero }; + return CreateInBoundsGEP(gv->getValueType(), gv, Args, Name); } //===--------------------------------------------------------------------===// @@ -1962,26 +1957,28 @@ Name); } - /// Create a launder.invariant.group intrinsic call. If Ptr type is - /// different from pointer to i8, it's casted to pointer to i8 in the same - /// address space before call and casted back to Ptr type after call. - Value *CreateLaunderInvariantGroup(Value *Ptr) { + /// Create an invariant.group.barrier intrinsic call, that stops + /// optimizer to propagate equality using invariant.group metadata. + /// If Ptr type is different from pointer to i8, it's casted to pointer to i8 + /// in the same address space before call and casted back to Ptr type after + /// call. 
+ Value *CreateInvariantGroupBarrier(Value *Ptr) { assert(isa(Ptr->getType()) && - "launder.invariant.group only applies to pointers."); + "invariant.group.barrier only applies to pointers."); auto *PtrType = Ptr->getType(); auto *Int8PtrTy = getInt8PtrTy(PtrType->getPointerAddressSpace()); if (PtrType != Int8PtrTy) Ptr = CreateBitCast(Ptr, Int8PtrTy); Module *M = BB->getParent()->getParent(); - Function *FnLaunderInvariantGroup = Intrinsic::getDeclaration( - M, Intrinsic::launder_invariant_group, {Int8PtrTy}); + Function *FnInvariantGroupBarrier = Intrinsic::getDeclaration( + M, Intrinsic::invariant_group_barrier, {Int8PtrTy}); - assert(FnLaunderInvariantGroup->getReturnType() == Int8PtrTy && - FnLaunderInvariantGroup->getFunctionType()->getParamType(0) == + assert(FnInvariantGroupBarrier->getReturnType() == Int8PtrTy && + FnInvariantGroupBarrier->getFunctionType()->getParamType(0) == Int8PtrTy && - "LaunderInvariantGroup should take and return the same type"); + "InvariantGroupBarrier should take and return the same type"); - CallInst *Fn = CreateCall(FnLaunderInvariantGroup, {Ptr}); + CallInst *Fn = CreateCall(FnInvariantGroupBarrier, {Ptr}); if (PtrType != Int8PtrTy) return CreateBitCast(Fn, PtrType); Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -710,7 +710,7 @@ llvm_anyptr_ty], [IntrArgMemOnly, NoCapture<2>]>; -// launder.invariant.group can't be marked with 'readnone' (IntrNoMem), +// invariant.group.barrier can't be marked with 'readnone' (IntrNoMem), // because it would cause CSE of two barriers with the same argument. // Inaccessiblememonly says that the barrier doesn't read the argument, // but it changes state not accessible to this module. This way @@ -722,9 +722,9 @@ // it would remove barrier. // Note that it is still experimental, which means that its semantics // might change in the future. -def int_launder_invariant_group : Intrinsic<[llvm_anyptr_ty], +def int_invariant_group_barrier : Intrinsic<[llvm_anyptr_ty], [LLVMMatchType<0>], - [IntrInaccessibleMemOnly, IntrSpeculatable]>; + [IntrInaccessibleMemOnly]>; //===------------------------ Stackmap Intrinsics -------------------------===// // Index: include/llvm/IR/Value.h =================================================================== --- include/llvm/IR/Value.h +++ include/llvm/IR/Value.h @@ -509,16 +509,15 @@ static_cast(this)->stripPointerCasts()); } - /// Strip off pointer casts, all-zero GEPs, aliases and invariant group - /// info. + /// Strip off pointer casts, all-zero GEPs, aliases and barriers. /// /// Returns the original uncasted value. If this is called on a non-pointer /// value, it returns 'this'. This function should be used only in /// Alias analysis. - const Value *stripPointerCastsAndInvariantGroups() const; - Value *stripPointerCastsAndInvariantGroups() { + const Value *stripPointerCastsAndBarriers() const; + Value *stripPointerCastsAndBarriers() { return const_cast( - static_cast(this)->stripPointerCastsAndInvariantGroups()); + static_cast(this)->stripPointerCastsAndBarriers()); } /// Strip off pointer casts and all-zero GEPs. Index: include/llvm/MC/MCObjectStreamer.h =================================================================== --- include/llvm/MC/MCObjectStreamer.h +++ include/llvm/MC/MCObjectStreamer.h @@ -85,8 +85,6 @@ /// will be used as a symbol offset within the fragment. 
void flushPendingLabels(MCFragment *F, uint64_t FOffset = 0); - void addFragmentAtoms(); - public: void visitUsedSymbol(const MCSymbol &Sym) override; Index: include/llvm/Transforms/Scalar/Reassociate.h =================================================================== --- include/llvm/Transforms/Scalar/Reassociate.h +++ include/llvm/Transforms/Scalar/Reassociate.h @@ -29,7 +29,6 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" -#include namespace llvm { @@ -70,14 +69,9 @@ /// Reassociate commutative expressions. class ReassociatePass : public PassInfoMixin { -public: - using OrderedSet = - SetVector, std::deque>>; - -protected: DenseMap RankMap; DenseMap, unsigned> ValueRankMap; - OrderedSet RedoInsts; + SetVector> RedoInsts; // Arbitrary, but prevents quadratic behavior. static const unsigned GlobalReassociateLimit = 10; @@ -114,7 +108,8 @@ SmallVectorImpl &Ops); Value *RemoveFactorFromExpression(Value *V, Value *Factor); void EraseInst(Instruction *I); - void RecursivelyEraseDeadInsts(Instruction *I, OrderedSet &Insts); + void RecursivelyEraseDeadInsts(Instruction *I, + SetVector> &Insts); void OptimizeInst(Instruction *I); Instruction *canonicalizeNegConstExpr(Instruction *I); void BuildPairMap(ReversePostOrderTraversal &RPOT); Index: lib/Analysis/BasicAliasAnalysis.cpp =================================================================== --- lib/Analysis/BasicAliasAnalysis.cpp +++ lib/Analysis/BasicAliasAnalysis.cpp @@ -985,8 +985,8 @@ const GEPOperator *GEP2, uint64_t V2Size, const DataLayout &DL) { - assert(GEP1->getPointerOperand()->stripPointerCastsAndInvariantGroups() == - GEP2->getPointerOperand()->stripPointerCastsAndInvariantGroups() && + assert(GEP1->getPointerOperand()->stripPointerCastsAndBarriers() == + GEP2->getPointerOperand()->stripPointerCastsAndBarriers() && GEP1->getPointerOperandType() == GEP2->getPointerOperandType() && "Expected GEPs with the same pointer operand"); @@ -1264,8 +1264,8 @@ // If we know the two GEPs are based off of the exact same pointer (and not // just the same underlying object), see if that tells us anything about // the resulting pointers. - if (GEP1->getPointerOperand()->stripPointerCastsAndInvariantGroups() == - GEP2->getPointerOperand()->stripPointerCastsAndInvariantGroups() && + if (GEP1->getPointerOperand()->stripPointerCastsAndBarriers() == + GEP2->getPointerOperand()->stripPointerCastsAndBarriers() && GEP1->getPointerOperandType() == GEP2->getPointerOperandType()) { AliasResult R = aliasSameBasePointerGEPs(GEP1, V1Size, GEP2, V2Size, DL); // If we couldn't find anything interesting, don't abandon just yet. @@ -1578,8 +1578,8 @@ return NoAlias; // Strip off any casts if they exist. - V1 = V1->stripPointerCastsAndInvariantGroups(); - V2 = V2->stripPointerCastsAndInvariantGroups(); + V1 = V1->stripPointerCastsAndBarriers(); + V2 = V2->stripPointerCastsAndBarriers(); // If V1 or V2 is undef, the result is NoAlias because we can always pick a // value for undef that aliases nothing in the program. Index: lib/Analysis/MemorySSA.cpp =================================================================== --- lib/Analysis/MemorySSA.cpp +++ lib/Analysis/MemorySSA.cpp @@ -352,6 +352,9 @@ const Instruction *I) { // If the memory can't be changed, then loads of the memory can't be // clobbered. + // + // FIXME: We should handle invariant groups, as well. It's a bit harder, + // because we need to pay close attention to invariant group barriers. 
return isa(I) && (I->getMetadata(LLVMContext::MD_invariant_load) || AA.pointsToConstantMemory(cast(I)-> getPointerOperand())); Index: lib/Analysis/ScalarEvolution.cpp =================================================================== --- lib/Analysis/ScalarEvolution.cpp +++ lib/Analysis/ScalarEvolution.cpp @@ -6932,12 +6932,63 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimit(const Loop *L, BasicBlock *ExitingBlock, bool AllowPredicates) { - assert(L->contains(ExitingBlock) && "Exit count for non-loop block?"); - // If our exiting block does not dominate the latch, then its connection with - // loop's exit limit may be far from trivial. - const BasicBlock *Latch = L->getLoopLatch(); - if (!Latch || !DT.dominates(ExitingBlock, Latch)) - return getCouldNotCompute(); + // Okay, we've chosen an exiting block. See what condition causes us to exit + // at this block and remember the exit block and whether all other targets + // lead to the loop header. + bool MustExecuteLoopHeader = true; + BasicBlock *Exit = nullptr; + for (auto *SBB : successors(ExitingBlock)) + if (!L->contains(SBB)) { + if (Exit) // Multiple exit successors. + return getCouldNotCompute(); + Exit = SBB; + } else if (SBB != L->getHeader()) { + MustExecuteLoopHeader = false; + } + + // At this point, we know we have a conditional branch that determines whether + // the loop is exited. However, we don't know if the branch is executed each + // time through the loop. If not, then the execution count of the branch will + // not be equal to the trip count of the loop. + // + // Currently we check for this by checking to see if the Exit branch goes to + // the loop header. If so, we know it will always execute the same number of + // times as the loop. We also handle the case where the exit block *is* the + // loop header. This is common for un-rotated loops. + // + // If both of those tests fail, walk up the unique predecessor chain to the + // header, stopping if there is an edge that doesn't exit the loop. If the + // header is reached, the execution count of the branch will be equal to the + // trip count of the loop. + // + // More extensive analysis could be done to handle more cases here. + // + if (!MustExecuteLoopHeader && ExitingBlock != L->getHeader()) { + // The simple checks failed, try climbing the unique predecessor chain + // up to the header. + bool Ok = false; + for (BasicBlock *BB = ExitingBlock; BB; ) { + BasicBlock *Pred = BB->getUniquePredecessor(); + if (!Pred) + return getCouldNotCompute(); + TerminatorInst *PredTerm = Pred->getTerminator(); + for (const BasicBlock *PredSucc : PredTerm->successors()) { + if (PredSucc == BB) + continue; + // If the predecessor has a successor that isn't BB and isn't + // outside the loop, assume the worst. + if (L->contains(PredSucc)) + return getCouldNotCompute(); + } + if (Pred == L->getHeader()) { + Ok = true; + break; + } + BB = Pred; + } + if (!Ok) + return getCouldNotCompute(); + } bool IsOnlyExit = (L->getExitingBlock() != nullptr); TerminatorInst *Term = ExitingBlock->getTerminator(); @@ -6952,19 +7003,9 @@ /*ControlsExit=*/IsOnlyExit, AllowPredicates); } - if (SwitchInst *SI = dyn_cast(Term)) { - // For switch, make sure that there is a single exit from the loop. - BasicBlock *Exit = nullptr; - for (auto *SBB : successors(ExitingBlock)) - if (!L->contains(SBB)) { - if (Exit) // Multiple exit successors. 
- return getCouldNotCompute(); - Exit = SBB; - } - assert(Exit && "Exiting block must have at least one exit"); + if (SwitchInst *SI = dyn_cast(Term)) return computeExitLimitFromSingleExitSwitch(L, SI, Exit, /*ControlsExit=*/IsOnlyExit); - } return getCouldNotCompute(); } Index: lib/CodeGen/AsmPrinter/AsmPrinter.cpp =================================================================== --- lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2235,7 +2235,6 @@ unsigned Size = DL.getTypeAllocSize(CDS->getType()); unsigned EmittedSize = DL.getTypeAllocSize(CDS->getType()->getElementType()) * CDS->getNumElements(); - assert(EmittedSize <= Size && "Size cannot be less than EmittedSize!"); if (unsigned Padding = Size - EmittedSize) AP.OutStreamer->EmitZeros(Padding); } Index: lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- lib/CodeGen/CodeGenPrepare.cpp +++ lib/CodeGen/CodeGenPrepare.cpp @@ -1668,7 +1668,7 @@ InsertedInsts.insert(ExtVal); return true; } - case Intrinsic::launder_invariant_group: + case Intrinsic::invariant_group_barrier: II->replaceAllUsesWith(II->getArgOperand(0)); II->eraseFromParent(); return true; Index: lib/CodeGen/GlobalISel/Utils.cpp =================================================================== --- lib/CodeGen/GlobalISel/Utils.cpp +++ lib/CodeGen/GlobalISel/Utils.cpp @@ -58,13 +58,6 @@ // register class constraints on some of their operands: If it's a use, we can // skip constraining as the instruction defining the register would constrain // it. - - // We can't constrain unallocatable register classes, because we can't create - // virtual registers for these classes, so we need to let targets handled this - // case. - if (RegClass && !RegClass->isAllocatable()) - RegClass = TRI.getConstrainedRegClassForOperand(RegMO, MRI); - if (!RegClass) { assert((!isTargetSpecificOpcode(II.getOpcode()) || RegMO.isUse()) && "Register class constraint is required unless either the " Index: lib/CodeGen/MIRParser/MILexer.h =================================================================== --- lib/CodeGen/MIRParser/MILexer.h +++ lib/CodeGen/MIRParser/MILexer.h @@ -64,13 +64,6 @@ kw_tied_def, kw_frame_setup, kw_frame_destroy, - kw_nnan, - kw_ninf, - kw_nsz, - kw_arcp, - kw_contract, - kw_afn, - kw_reassoc, kw_debug_location, kw_cfi_same_value, kw_cfi_offset, Index: lib/CodeGen/MIRParser/MILexer.cpp =================================================================== --- lib/CodeGen/MIRParser/MILexer.cpp +++ lib/CodeGen/MIRParser/MILexer.cpp @@ -212,13 +212,6 @@ .Case("tied-def", MIToken::kw_tied_def) .Case("frame-setup", MIToken::kw_frame_setup) .Case("frame-destroy", MIToken::kw_frame_destroy) - .Case("nnan", MIToken::kw_nnan) - .Case("ninf", MIToken::kw_ninf) - .Case("nsz", MIToken::kw_nsz) - .Case("arcp", MIToken::kw_arcp) - .Case("contract", MIToken::kw_contract) - .Case("afn", MIToken::kw_afn) - .Case("reassoc", MIToken::kw_reassoc) .Case("debug-location", MIToken::kw_debug_location) .Case("same_value", MIToken::kw_cfi_same_value) .Case("offset", MIToken::kw_cfi_offset) Index: lib/CodeGen/MIRParser/MIParser.cpp =================================================================== --- lib/CodeGen/MIRParser/MIParser.cpp +++ lib/CodeGen/MIRParser/MIParser.cpp @@ -936,36 +936,13 @@ } bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) { - // Allow frame and fast math flags for OPCODE + // Allow both: + // * frame-setup frame-destroy OPCODE + // * frame-destroy frame-setup OPCODE while 
(Token.is(MIToken::kw_frame_setup) || - Token.is(MIToken::kw_frame_destroy) || - Token.is(MIToken::kw_nnan) || - Token.is(MIToken::kw_ninf) || - Token.is(MIToken::kw_nsz) || - Token.is(MIToken::kw_arcp) || - Token.is(MIToken::kw_contract) || - Token.is(MIToken::kw_afn) || - Token.is(MIToken::kw_reassoc)) { - // Mine frame and fast math flags - if (Token.is(MIToken::kw_frame_setup)) - Flags |= MachineInstr::FrameSetup; - if (Token.is(MIToken::kw_frame_destroy)) - Flags |= MachineInstr::FrameDestroy; - if (Token.is(MIToken::kw_nnan)) - Flags |= MachineInstr::FmNoNans; - if (Token.is(MIToken::kw_ninf)) - Flags |= MachineInstr::FmNoInfs; - if (Token.is(MIToken::kw_nsz)) - Flags |= MachineInstr::FmNsz; - if (Token.is(MIToken::kw_arcp)) - Flags |= MachineInstr::FmArcp; - if (Token.is(MIToken::kw_contract)) - Flags |= MachineInstr::FmContract; - if (Token.is(MIToken::kw_afn)) - Flags |= MachineInstr::FmAfn; - if (Token.is(MIToken::kw_reassoc)) - Flags |= MachineInstr::FmReassoc; - + Token.is(MIToken::kw_frame_destroy)) { + Flags |= Token.is(MIToken::kw_frame_setup) ? MachineInstr::FrameSetup + : MachineInstr::FrameDestroy; lex(); } if (Token.isNot(MIToken::Identifier)) Index: lib/CodeGen/MIRPrinter.cpp =================================================================== --- lib/CodeGen/MIRPrinter.cpp +++ lib/CodeGen/MIRPrinter.cpp @@ -680,20 +680,6 @@ OS << "frame-setup "; if (MI.getFlag(MachineInstr::FrameDestroy)) OS << "frame-destroy "; - if (MI.getFlag(MachineInstr::FmNoNans)) - OS << "nnan "; - if (MI.getFlag(MachineInstr::FmNoInfs)) - OS << "ninf "; - if (MI.getFlag(MachineInstr::FmNsz)) - OS << "nsz "; - if (MI.getFlag(MachineInstr::FmArcp)) - OS << "arcp "; - if (MI.getFlag(MachineInstr::FmContract)) - OS << "contract "; - if (MI.getFlag(MachineInstr::FmAfn)) - OS << "afn "; - if (MI.getFlag(MachineInstr::FmReassoc)) - OS << "reassoc "; OS << TII->getName(MI.getOpcode()); if (I < E) Index: lib/CodeGen/MachineCSE.cpp =================================================================== --- lib/CodeGen/MachineCSE.cpp +++ lib/CodeGen/MachineCSE.cpp @@ -445,13 +445,15 @@ // Heuristics #3: If the common subexpression is used by PHIs, do not reuse // it unless the defined value is already used in the BB of the new use. bool HasPHI = false; - for (MachineInstr &UseMI : MRI->use_nodbg_instructions(CSReg)) { - HasPHI |= UseMI.isPHI(); - if (UseMI.getParent() == MI->getParent()) - return true; + SmallPtrSet CSBBs; + for (MachineInstr &MI : MRI->use_nodbg_instructions(CSReg)) { + HasPHI |= MI.isPHI(); + CSBBs.insert(MI.getParent()); } - return !HasPHI; + if (!HasPHI) + return true; + return CSBBs.count(MI->getParent()); } void MachineCSE::EnterScope(MachineBasicBlock *MBB) { Index: lib/CodeGen/MachineInstr.cpp =================================================================== --- lib/CodeGen/MachineInstr.cpp +++ lib/CodeGen/MachineInstr.cpp @@ -1302,20 +1302,6 @@ OS << "frame-setup "; if (getFlag(MachineInstr::FrameDestroy)) OS << "frame-destroy "; - if (getFlag(MachineInstr::FmNoNans)) - OS << "nnan "; - if (getFlag(MachineInstr::FmNoInfs)) - OS << "ninf "; - if (getFlag(MachineInstr::FmNsz)) - OS << "nsz "; - if (getFlag(MachineInstr::FmArcp)) - OS << "arcp "; - if (getFlag(MachineInstr::FmContract)) - OS << "contract "; - if (getFlag(MachineInstr::FmAfn)) - OS << "afn "; - if (getFlag(MachineInstr::FmReassoc)) - OS << "reassoc "; // Print the opcode name. 
if (TII) Index: lib/CodeGen/RegUsageInfoCollector.cpp =================================================================== --- lib/CodeGen/RegUsageInfoCollector.cpp +++ lib/CodeGen/RegUsageInfoCollector.cpp @@ -110,18 +110,19 @@ // Scan all the physical registers. When a register is defined in the current // function set it and all the aliasing registers as defined in the regmask. for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) { + // If a register is in the UsedPhysRegsMask set then mark it as defined. + // All it's aliases will also be in the set, so we can skip setting + // as defined all the aliases here. + if (UsedPhysRegsMask.test(PReg)) { + SetRegAsDefined(PReg); + continue; + } // If a register is defined by an instruction mark it as defined together // with all it's aliases. if (!MRI->def_empty(PReg)) { for (MCRegAliasIterator AI(PReg, TRI, true); AI.isValid(); ++AI) SetRegAsDefined(*AI); - continue; } - // If a register is in the UsedPhysRegsMask set then mark it as defined. - // All clobbered aliases will also be in the set, so we can skip setting - // as defined all the aliases here. - if (UsedPhysRegsMask.test(PReg)) - SetRegAsDefined(PReg); } if (!TargetFrameLowering::isSafeForNoCSROpt(F)) { Index: lib/CodeGen/SelectionDAG/FastISel.cpp =================================================================== --- lib/CodeGen/SelectionDAG/FastISel.cpp +++ lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1444,7 +1444,7 @@ updateValueMap(II, ResultReg); return true; } - case Intrinsic::launder_invariant_group: + case Intrinsic::invariant_group_barrier: case Intrinsic::expect: { unsigned ResultReg = getRegForValue(II->getArgOperand(0)); if (!ResultReg) Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -1053,9 +1053,6 @@ bool occupiesMultipleRegs() const { return std::accumulate(RegCount.begin(), RegCount.end(), 0) > 1; } - - /// Return a list of registers and their sizes. 
- SmallVector, 4> getRegsAndSizes() const; }; } // end namespace llvm Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -971,20 +971,6 @@ } } -SmallVector, 4> -RegsForValue::getRegsAndSizes() const { - SmallVector, 4> OutVec; - unsigned I = 0; - for (auto CountAndVT : zip_first(RegCount, RegVTs)) { - unsigned RegCount = std::get<0>(CountAndVT); - MVT RegisterVT = std::get<1>(CountAndVT); - unsigned RegisterSize = RegisterVT.getSizeInBits(); - for (unsigned E = I + RegCount; I != E; ++I) - OutVec.push_back(std::make_pair(Regs[I], RegisterSize)); - } - return OutVec; -} - void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa, const TargetLibraryInfo *li) { AA = aa; @@ -4922,18 +4908,26 @@ const auto &TLI = DAG.getTargetLoweringInfo(); RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), VMI->second, V->getType(), isABIRegCopy(V)); - if (RFV.occupiesMultipleRegs()) { + unsigned NumRegs = + std::accumulate(RFV.RegCount.begin(), RFV.RegCount.end(), 0); + if (NumRegs > 1) { + unsigned I = 0; unsigned Offset = 0; - for (auto RegAndSize : RFV.getRegsAndSizes()) { - Op = MachineOperand::CreateReg(RegAndSize.first, false); - auto FragmentExpr = DIExpression::createFragmentExpression( - Expr, Offset, RegAndSize.second); - if (!FragmentExpr) - continue; - FuncInfo.ArgDbgValues.push_back( - BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsDbgDeclare, - Op->getReg(), Variable, *FragmentExpr)); - Offset += RegAndSize.second; + auto RegisterVT = RFV.RegVTs.begin(); + for (auto RegCount : RFV.RegCount) { + unsigned RegisterSize = (RegisterVT++)->getSizeInBits(); + for (unsigned E = I + RegCount; I != E; ++I) { + // The vregs are guaranteed to be allocated in sequence. + Op = MachineOperand::CreateReg(VMI->second + I, false); + auto FragmentExpr = DIExpression::createFragmentExpression( + Expr, Offset, RegisterSize); + if (!FragmentExpr) + continue; + FuncInfo.ArgDbgValues.push_back( + BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsDbgDeclare, + Op->getReg(), Variable, *FragmentExpr)); + Offset += RegisterSize; + } } return true; } @@ -5271,28 +5265,23 @@ RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg, V->getType(), false); if (RFV.occupiesMultipleRegs()) { + unsigned I = 0; unsigned Offset = 0; - unsigned BitsToDescribe = 0; - if (auto VarSize = Variable->getSizeInBits()) - BitsToDescribe = *VarSize; - if (auto Fragment = Expression->getFragmentInfo()) - BitsToDescribe = Fragment->SizeInBits; - for (auto RegAndSize : RFV.getRegsAndSizes()) { - unsigned RegisterSize = RegAndSize.second; - // Bail out if all bits are described already. - if (Offset >= BitsToDescribe) - break; - unsigned FragmentSize = (Offset + RegisterSize > BitsToDescribe) - ? 
BitsToDescribe - Offset - : RegisterSize; - auto FragmentExpr = DIExpression::createFragmentExpression( - Expression, Offset, FragmentSize); - if (!FragmentExpr) + for (auto CountAndVT : zip_first(RFV.RegCount, RFV.RegVTs)) { + unsigned RegCount = std::get<0>(CountAndVT); + MVT RegisterVT = std::get<1>(CountAndVT); + unsigned RegisterSize = RegisterVT.getSizeInBits(); + for (unsigned E = I + RegCount; I != E; ++I) { + auto FragmentExpr = DIExpression::createFragmentExpression( + Expression, Offset, RegisterSize); + if (!FragmentExpr) continue; - SDV = DAG.getVRegDbgValue(Variable, *FragmentExpr, RegAndSize.first, - false, dl, SDNodeOrder); - DAG.AddDbgValue(SDV, nullptr, false); - Offset += RegisterSize; + // The vregs are guaranteed to be allocated in sequence. + SDV = DAG.getVRegDbgValue(Variable, *FragmentExpr, Reg + I, + false, dl, SDNodeOrder); + DAG.AddDbgValue(SDV, nullptr, false); + Offset += RegisterSize; + } } } else { SDV = DAG.getVRegDbgValue(Variable, Expression, Reg, false, dl, @@ -5746,7 +5735,7 @@ } case Intrinsic::annotation: case Intrinsic::ptr_annotation: - case Intrinsic::launder_invariant_group: + case Intrinsic::invariant_group_barrier: // Drop the intrinsic, but forward the value setValue(&I, getValue(I.getOperand(0))); return nullptr; Index: lib/IR/AutoUpgrade.cpp =================================================================== --- lib/IR/AutoUpgrade.cpp +++ lib/IR/AutoUpgrade.cpp @@ -528,17 +528,6 @@ return true; } } - if (Name.startswith("invariant.group.barrier")) { - // Rename invariant.group.barrier to launder.invariant.group - auto Args = F->getFunctionType()->params(); - Type* ObjectPtr[1] = {Args[0]}; - rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::launder_invariant_group, ObjectPtr); - return true; - - } - break; } case 'm': { Index: lib/IR/DebugInfoMetadata.cpp =================================================================== --- lib/IR/DebugInfoMetadata.cpp +++ lib/IR/DebugInfoMetadata.cpp @@ -830,9 +830,9 @@ case dwarf::DW_OP_LLVM_fragment: { // Make the new offset point into the existing fragment. uint64_t FragmentOffsetInBits = Op.getArg(0); - uint64_t FragmentSizeInBits = Op.getArg(1); - (void)FragmentSizeInBits; - assert((OffsetInBits + SizeInBits <= FragmentSizeInBits) && + // Op.getArg(0) is FragmentOffsetInBits. + // Op.getArg(1) is FragmentSizeInBits. + assert((OffsetInBits + SizeInBits <= Op.getArg(0) + Op.getArg(1)) && "new fragment outside of original fragment"); OffsetInBits += FragmentOffsetInBits; continue; Index: lib/IR/Value.cpp =================================================================== --- lib/IR/Value.cpp +++ lib/IR/Value.cpp @@ -499,7 +499,7 @@ enum PointerStripKind { PSK_ZeroIndices, PSK_ZeroIndicesAndAliases, - PSK_ZeroIndicesAndAliasesAndInvariantGroups, + PSK_ZeroIndicesAndAliasesAndBarriers, PSK_InBoundsConstantIndices, PSK_InBounds }; @@ -518,7 +518,7 @@ if (auto *GEP = dyn_cast(V)) { switch (StripKind) { case PSK_ZeroIndicesAndAliases: - case PSK_ZeroIndicesAndAliasesAndInvariantGroups: + case PSK_ZeroIndicesAndAliasesAndBarriers: case PSK_ZeroIndices: if (!GEP->hasAllZeroIndices()) return V; @@ -546,11 +546,11 @@ V = RV; continue; } - // The result of launder.invariant.group must alias it's argument, + // The result of invariant.group.barrier must alias it's argument, // but it can't be marked with returned attribute, that's why it needs // special case. 
- if (StripKind == PSK_ZeroIndicesAndAliasesAndInvariantGroups && - CS.getIntrinsicID() == Intrinsic::launder_invariant_group) { + if (StripKind == PSK_ZeroIndicesAndAliasesAndBarriers && + CS.getIntrinsicID() == Intrinsic::invariant_group_barrier) { V = CS.getArgOperand(0); continue; } @@ -576,8 +576,8 @@ return stripPointerCastsAndOffsets(this); } -const Value *Value::stripPointerCastsAndInvariantGroups() const { - return stripPointerCastsAndOffsets( +const Value *Value::stripPointerCastsAndBarriers() const { + return stripPointerCastsAndOffsets( this); } Index: lib/LTO/LTOBackend.cpp =================================================================== --- lib/LTO/LTOBackend.cpp +++ lib/LTO/LTOBackend.cpp @@ -421,15 +421,14 @@ } -static Error +static void finalizeOptimizationRemarks(std::unique_ptr DiagOutputFile) { // Make sure we flush the diagnostic remarks file in case the linker doesn't // call the global destructors before exiting. if (!DiagOutputFile) - return Error::success(); + return; DiagOutputFile->keep(); DiagOutputFile->os().flush(); - return Error::success(); } Error lto::backend(Config &C, AddStreamFn AddStream, @@ -451,8 +450,10 @@ if (!C.CodeGenOnly) { if (!opt(C, TM.get(), 0, *Mod, /*IsThinLTO=*/false, - /*ExportSummary=*/&CombinedIndex, /*ImportSummary=*/nullptr)) - return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); + /*ExportSummary=*/&CombinedIndex, /*ImportSummary=*/nullptr)) { + finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); + return Error::success(); + } } if (ParallelCodeGenParallelismLevel == 1) { @@ -461,7 +462,8 @@ splitCodeGen(C, TM.get(), AddStream, ParallelCodeGenParallelismLevel, std::move(Mod)); } - return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); + finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); + return Error::success(); } static void dropDeadSymbols(Module &Mod, const GVSummaryMapTy &DefinedGlobals, @@ -496,20 +498,13 @@ std::unique_ptr TM = createTargetMachine(Conf, *TOrErr, Mod); - // Setup optimization remarks. 
- auto DiagFileOrErr = lto::setupOptimizationRemarks( - Mod.getContext(), Conf.RemarksFilename, Conf.RemarksWithHotness, Task); - if (!DiagFileOrErr) - return DiagFileOrErr.takeError(); - auto DiagnosticOutputFile = std::move(*DiagFileOrErr); - if (Conf.CodeGenOnly) { codegen(Conf, TM.get(), AddStream, Task, Mod); - return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); + return Error::success(); } if (Conf.PreOptModuleHook && !Conf.PreOptModuleHook(Task, Mod)) - return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); + return Error::success(); renameModuleForThinLTO(Mod, CombinedIndex); @@ -518,14 +513,14 @@ thinLTOResolveWeakForLinkerModule(Mod, DefinedGlobals); if (Conf.PostPromoteModuleHook && !Conf.PostPromoteModuleHook(Task, Mod)) - return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); + return Error::success(); if (!DefinedGlobals.empty()) thinLTOInternalizeModule(Mod, DefinedGlobals); if (Conf.PostInternalizeModuleHook && !Conf.PostInternalizeModuleHook(Task, Mod)) - return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); + return Error::success(); auto ModuleLoader = [&](StringRef Identifier) { assert(Mod.getContext().isODRUniquingDebugTypes() && @@ -542,12 +537,12 @@ return Err; if (Conf.PostImportModuleHook && !Conf.PostImportModuleHook(Task, Mod)) - return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); + return Error::success(); if (!opt(Conf, TM.get(), Task, Mod, /*IsThinLTO=*/true, /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex)) - return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); + return Error::success(); codegen(Conf, TM.get(), AddStream, Task, Mod); - return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); + return Error::success(); } Index: lib/MC/MCContext.cpp =================================================================== --- lib/MC/MCContext.cpp +++ lib/MC/MCContext.cpp @@ -516,10 +516,8 @@ StringRef CachedName = Entry.first.SectionName; MCSymbol *Begin = nullptr; - if (BeginSymName) { - Begin = createSymbol(BeginSymName, false, false); - cast(Begin)->setType(wasm::WASM_SYMBOL_TYPE_SECTION); - } + if (BeginSymName) + Begin = createTempSymbol(BeginSymName, false); MCSectionWasm *Result = new (WasmAllocator.Allocate()) MCSectionWasm(CachedName, Kind, GroupSym, UniqueID, Begin); Index: lib/MC/MCMachOStreamer.cpp =================================================================== --- lib/MC/MCMachOStreamer.cpp +++ lib/MC/MCMachOStreamer.cpp @@ -455,7 +455,30 @@ // We have to set the fragment atom associations so we can relax properly for // Mach-O. - addFragmentAtoms(); + + // First, scan the symbol table to build a lookup table from fragments to + // defining symbols. + DenseMap DefiningSymbolMap; + for (const MCSymbol &Symbol : getAssembler().symbols()) { + if (getAssembler().isSymbolLinkerVisible(Symbol) && Symbol.isInSection() && + !Symbol.isVariable()) { + // An atom defining symbol should never be internal to a fragment. + assert(Symbol.getOffset() == 0 && + "Invalid offset in atom defining symbol!"); + DefiningSymbolMap[Symbol.getFragment()] = &Symbol; + } + } + + // Set the fragment atom associations by tracking the last seen atom defining + // symbol. 
+ for (MCSection &Sec : getAssembler()) { + const MCSymbol *CurrentAtom = nullptr; + for (MCFragment &Frag : Sec) { + if (const MCSymbol *Symbol = DefiningSymbolMap.lookup(&Frag)) + CurrentAtom = Symbol; + Frag.setAtom(CurrentAtom); + } + } this->MCObjectStreamer::FinishImpl(); } Index: lib/MC/MCObjectFileInfo.cpp =================================================================== --- lib/MC/MCObjectFileInfo.cpp +++ lib/MC/MCObjectFileInfo.cpp @@ -866,19 +866,19 @@ DataSection = Ctx->getWasmSection(".data", SectionKind::getData()); // TODO: Set the section types and flags. - DwarfLineSection = Ctx->getWasmSection(".debug_line", SectionKind::getMetadata(), ".debug_line"); + DwarfLineSection = Ctx->getWasmSection(".debug_line", SectionKind::getMetadata()); DwarfLineStrSection = Ctx->getWasmSection(".debug_line_str", SectionKind::getMetadata()); - DwarfStrSection = Ctx->getWasmSection(".debug_str", SectionKind::getMetadata(), ".debug_str"); + DwarfStrSection = Ctx->getWasmSection(".debug_str", SectionKind::getMetadata()); DwarfLocSection = Ctx->getWasmSection(".debug_loc", SectionKind::getMetadata()); - DwarfAbbrevSection = Ctx->getWasmSection(".debug_abbrev", SectionKind::getMetadata(), ".section_abbrev"); + DwarfAbbrevSection = Ctx->getWasmSection(".debug_abbrev", SectionKind::getMetadata(), "section_abbrev"); DwarfARangesSection = Ctx->getWasmSection(".debug_aranges", SectionKind::getMetadata()); - DwarfRangesSection = Ctx->getWasmSection(".debug_ranges", SectionKind::getMetadata(), ".debug_range"); - DwarfMacinfoSection = Ctx->getWasmSection(".debug_macinfo", SectionKind::getMetadata(), ".debug_macinfo"); + DwarfRangesSection = Ctx->getWasmSection(".debug_ranges", SectionKind::getMetadata(), "debug_range"); + DwarfMacinfoSection = Ctx->getWasmSection(".debug_macinfo", SectionKind::getMetadata(), "debug_macinfo"); DwarfAddrSection = Ctx->getWasmSection(".debug_addr", SectionKind::getMetadata()); DwarfCUIndexSection = Ctx->getWasmSection(".debug_cu_index", SectionKind::getMetadata()); DwarfTUIndexSection = Ctx->getWasmSection(".debug_tu_index", SectionKind::getMetadata()); - DwarfInfoSection = Ctx->getWasmSection(".debug_info", SectionKind::getMetadata(), ".debug_info"); + DwarfInfoSection = Ctx->getWasmSection(".debug_info", SectionKind::getMetadata(), "section_info"); DwarfFrameSection = Ctx->getWasmSection(".debug_frame", SectionKind::getMetadata()); DwarfPubNamesSection = Ctx->getWasmSection(".debug_pubnames", SectionKind::getMetadata()); DwarfPubTypesSection = Ctx->getWasmSection(".debug_pubtypes", SectionKind::getMetadata()); Index: lib/MC/MCObjectStreamer.cpp =================================================================== --- lib/MC/MCObjectStreamer.cpp +++ lib/MC/MCObjectStreamer.cpp @@ -60,32 +60,6 @@ PendingLabels.clear(); } -void MCObjectStreamer::addFragmentAtoms() { - // First, scan the symbol table to build a lookup table from fragments to - // defining symbols. - DenseMap DefiningSymbolMap; - for (const MCSymbol &Symbol : getAssembler().symbols()) { - if (getAssembler().isSymbolLinkerVisible(Symbol) && Symbol.isInSection() && - !Symbol.isVariable()) { - // An atom defining symbol should never be internal to a fragment. - assert(Symbol.getOffset() == 0 && - "Invalid offset in atom defining symbol!"); - DefiningSymbolMap[Symbol.getFragment()] = &Symbol; - } - } - - // Set the fragment atom associations by tracking the last seen atom defining - // symbol. 
- for (MCSection &Sec : getAssembler()) { - const MCSymbol *CurrentAtom = nullptr; - for (MCFragment &Frag : Sec) { - if (const MCSymbol *Symbol = DefiningSymbolMap.lookup(&Frag)) - CurrentAtom = Symbol; - Frag.setAtom(CurrentAtom); - } - } -} - // As a compile-time optimization, avoid allocating and evaluating an MCExpr // tree for (Hi - Lo) when Hi and Lo are offsets into the same fragment. static Optional absoluteSymbolDiff(const MCSymbol *Hi, Index: lib/MC/MCWasmStreamer.cpp =================================================================== --- lib/MC/MCWasmStreamer.cpp +++ lib/MC/MCWasmStreamer.cpp @@ -156,8 +156,17 @@ } void MCWasmStreamer::EmitIdent(StringRef IdentString) { - // TODO(sbc): Add the ident section once we support mergable strings - // sections in the object format + MCSection *Comment = getAssembler().getContext().getWasmSection( + ".comment", SectionKind::getMetadata()); + PushSection(); + SwitchSection(Comment); + if (!SeenIdent) { + EmitIntValue(0, 1); + SeenIdent = true; + } + EmitBytes(IdentString); + EmitIntValue(0, 1); + PopSection(); } void MCWasmStreamer::EmitInstToFragment(const MCInst &Inst, @@ -189,9 +198,6 @@ void MCWasmStreamer::FinishImpl() { EmitFrames(nullptr); - // Set fragment atoms so we can map from code fragment to defining symbol - addFragmentAtoms(); - this->MCObjectStreamer::FinishImpl(); } Index: lib/MC/WasmObjectWriter.cpp =================================================================== --- lib/MC/WasmObjectWriter.cpp +++ lib/MC/WasmObjectWriter.cpp @@ -244,6 +244,8 @@ DenseMap WasmIndices; // Maps data symbols to the Wasm segment and offset/size with the segment. DenseMap DataLocations; + // Maps section symbols to the section. + DenseMap CustomSectionSymbols; // Stores output data (index, relocations, content offset) for custom // section. @@ -293,6 +295,7 @@ FunctionTypes.clear(); Globals.clear(); DataSegments.clear(); + CustomSectionSymbols.clear(); MCObjectWriter::reset(); NumFunctionImports = 0; NumGlobalImports = 0; @@ -430,6 +433,11 @@ if (FixupSection.getSectionName().startswith(".init_array")) return; + // TODO: Add support for non-debug metadata sections? + if (FixupSection.getKind().isMetadata() && + !FixupSection.getSectionName().startswith(".debug_")) + return; + if (const MCSymbolRefExpr *RefB = Target.getSymB()) { assert(RefB->getKind() == MCSymbolRefExpr::VK_None && "Should not have constructed this"); @@ -487,50 +495,36 @@ // be negative and don't wrap. FixedValue = 0; - unsigned Type = getRelocType(Target, Fixup); + if (SymA) + SymA->setUsedInReloc(); + assert(!IsPCRel); assert(SymA); - // Absolute offset within a section or a function. - // Currently only supported for for metadata sections. - // See: test/MC/WebAssembly/blockaddress.ll - if (Type == wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32 || - Type == wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32) { - if (!FixupSection.getKind().isMetadata()) - report_fatal_error("relocations for function or section offsets are " - "only supported in metadata sections"); - - const MCSymbol *SectionSymbol = nullptr; - const MCSection &SecA = SymA->getSection(); - if (SecA.getKind().isText()) - SectionSymbol = SecA.begin()->getAtom(); - else - SectionSymbol = SecA.getBeginSymbol(); - if (!SectionSymbol) - report_fatal_error("section symbol is required for relocation"); - - C += Layout.getSymbolOffset(*SymA); - SymA = cast(SectionSymbol); - } - - // Relocation other than R_WEBASSEMBLY_TYPE_INDEX_LEB are required to be - // against a named symbol. 
- if (Type != wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB) { - if (SymA->getName().empty()) - report_fatal_error("relocations against un-named temporaries are not yet " - "supported by wasm"); - - SymA->setUsedInReloc(); - } + unsigned Type = getRelocType(Target, Fixup); WasmRelocationEntry Rec(FixupOffset, SymA, C, Type, &FixupSection); DEBUG(dbgs() << "WasmReloc: " << Rec << "\n"); + // Relocation other than R_WEBASSEMBLY_TYPE_INDEX_LEB, + // R_WEBASSEMBLY_SECTION_OFFSET_I32 or R_WEBASSEMBLY_FUNCTION_OFFSET_I32 + // are currently required to be against a named symbol. + // TODO(sbc): Add support for relocations against unnamed temporaries such + // as those generated by llvm's `blockaddress`. + // See: test/MC/WebAssembly/blockaddress.ll + if (SymA->getName().empty() && + !(Type == wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB || + Type == wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32 || + Type == wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32)) + report_fatal_error("relocations against un-named temporaries are not yet " + "supported by wasm"); + if (FixupSection.isWasmData()) { DataRelocations.push_back(Rec); } else if (FixupSection.getKind().isText()) { CodeRelocations.push_back(Rec); } else if (FixupSection.getKind().isMetadata()) { + assert(FixupSection.getSectionName().startswith(".debug_")); CustomSectionsRelocations[&FixupSection].push_back(Rec); } else { llvm_unreachable("unexpected section type"); @@ -597,12 +591,15 @@ report_fatal_error("symbol not found in wasm index space: " + RelEntry.Symbol->getName()); return WasmIndices[RelEntry.Symbol]; - case wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32: - case wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32: { + case wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32: { const auto &Section = static_cast(RelEntry.Symbol->getSection()); return Section.getSectionOffset() + RelEntry.Addend; } + case wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32: { + const auto &Section = *CustomSectionSymbols.find(RelEntry.Symbol)->second; + return Section.getSectionOffset() + RelEntry.Addend; + } case wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB: case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32: case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB: { @@ -1036,10 +1033,49 @@ CustomSection.OutputIndex = Section.Index; endSection(Section); + } +} + +void WasmObjectWriter::updateCustomSectionRelocations( + const SmallVector &Functions, const MCAsmLayout &Layout) { + std::map SectionSymbols; + for (const auto &P : CustomSectionSymbols) + SectionSymbols[P.second] = P.first; + std::map FuncSymbols; + for (const auto &FuncInfo : Functions) + FuncSymbols[&FuncInfo.Sym->getSection()] = FuncInfo.Sym; + + // Patch relocation records for R_WEBASSEMBLY_FUNCTION_OFFSET_I32 and + // R_WEBASSEMBLY_SECTION_OFFSET_I32. The Addend is stuffed the offset from + // the beginning of the function or custom section -- all such relocations + // target the function or custom section starts. 
+ for (auto &Section : CustomSections) { + auto &Relocations = CustomSectionsRelocations[Section.Section]; + for (WasmRelocationEntry &RelEntry : Relocations) { + switch (RelEntry.Type) { + case wasm::R_WEBASSEMBLY_FUNCTION_OFFSET_I32: { + assert(RelEntry.hasAddend()); + auto &Section = + static_cast(RelEntry.Symbol->getSection()); + RelEntry.Addend += Layout.getSymbolOffset(*RelEntry.Symbol); + RelEntry.Symbol = FuncSymbols[&Section]; + break; + } + case wasm::R_WEBASSEMBLY_SECTION_OFFSET_I32: { + assert(RelEntry.hasAddend()); + auto &Section = + static_cast(RelEntry.Symbol->getSection()); + RelEntry.Addend += Layout.getSymbolOffset(*RelEntry.Symbol); + RelEntry.Symbol = SectionSymbols[&Section]; + break; + } + default: + break; + } + } // Apply fixups. - auto &Relocations = CustomSectionsRelocations[CustomSection.Section]; - applyRelocations(Relocations, CustomSection.OutputContentsOffset); + applyRelocations(Relocations, Section.OutputContentsOffset); } } @@ -1068,25 +1104,6 @@ return Pair.first->second; } -static bool isInSymtab(const MCSymbolWasm &Sym) { - if (Sym.isUsedInReloc()) - return true; - - if (Sym.isComdat() && !Sym.isDefined()) - return false; - - if (Sym.isTemporary() && Sym.getName().empty()) - return false; - - if (Sym.isTemporary() && Sym.isData() && !Sym.getSize()) - return false; - - if (Sym.isSection()) - return false; - - return true; -} - void WasmObjectWriter::writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) { DEBUG(dbgs() << "WasmObjectWriter::writeObject\n"); @@ -1164,53 +1181,76 @@ } } - // Populate DataSegments and CustomSections, which must be done before - // populating DataLocations. + // Populate DataSegments, which must be done before populating DataLocations. for (MCSection &Sec : Asm) { auto &Section = static_cast(Sec); - StringRef SectionName = Section.getSectionName(); - // .init_array sections are handled specially elsewhere. - if (SectionName.startswith(".init_array")) + if (Section.getSectionName().startswith(".custom_section.")) { + if (Section.getFragmentList().empty()) + continue; + if (Section.getFragmentList().size() != 1) + report_fatal_error( + "only one .custom_section section fragment supported"); + const MCFragment &Frag = *Section.begin(); + if (Frag.hasInstructions() || Frag.getKind() != MCFragment::FT_Data) + report_fatal_error("only data supported in .custom_section section"); + const auto &DataFrag = cast(Frag); + if (!DataFrag.getFixups().empty()) + report_fatal_error("fixups not supported in .custom_section section"); + StringRef UserName = Section.getSectionName().substr(16); + CustomSections.emplace_back(UserName, &Section); continue; + } - // Code is handled separately - if (Section.getKind().isText()) + if (!Section.isWasmData()) continue; - if (Section.isWasmData()) { - uint32_t SegmentIndex = DataSegments.size(); - DataSize = alignTo(DataSize, Section.getAlignment()); - DataSegments.emplace_back(); - WasmDataSegment &Segment = DataSegments.back(); - Segment.Name = SectionName; - Segment.Offset = DataSize; - Segment.Section = &Section; - addData(Segment.Data, Section); - Segment.Alignment = Section.getAlignment(); - Segment.Flags = 0; - DataSize += Segment.Data.size(); - Section.setSegmentIndex(SegmentIndex); - - if (const MCSymbolWasm *C = Section.getGroup()) { - Comdats[C->getName()].emplace_back( - WasmComdatEntry{wasm::WASM_COMDAT_DATA, SegmentIndex}); - } - } else { - // Create custom sections - assert(Sec.getKind().isMetadata()); + // .init_array sections are handled specially elsewhere. 
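
// Sketch of the rewrite that updateCustomSectionRelocations performs above:
// for the *_OFFSET_I32 relocation kinds, the symbol's layout offset is folded
// into the addend and the entry is re-pointed at the symbol standing for the
// enclosing function or custom section. Simplified stand-in types only.
#include <cstdint>

struct SymStub {};
struct RelocEntryStub {
  const SymStub *Symbol;
  uint64_t Addend;
};

static void retargetToSectionStart(RelocEntryStub &Rel,
                                   uint64_t SymbolOffsetInSection,
                                   const SymStub *SectionOrFuncSym) {
  Rel.Addend += SymbolOffsetInSection; // Layout.getSymbolOffset(*Symbol)
  Rel.Symbol = SectionOrFuncSym;       // FuncSymbols[...] / SectionSymbols[...]
}
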
+ if (cast(Sec).getSectionName().startswith(".init_array")) + continue; - StringRef Name = SectionName; + uint32_t SegmentIndex = DataSegments.size(); + DataSize = alignTo(DataSize, Section.getAlignment()); + DataSegments.emplace_back(); + WasmDataSegment &Segment = DataSegments.back(); + Segment.Name = Section.getSectionName(); + Segment.Offset = DataSize; + Segment.Section = &Section; + addData(Segment.Data, Section); + Segment.Alignment = Section.getAlignment(); + Segment.Flags = 0; + DataSize += Segment.Data.size(); + Section.setSegmentIndex(SegmentIndex); + + if (const MCSymbolWasm *C = Section.getGroup()) { + Comdats[C->getName()].emplace_back( + WasmComdatEntry{wasm::WASM_COMDAT_DATA, SegmentIndex}); + } + } - // For user-defined custom sections, strip the prefix - if (Name.startswith(".custom_section.")) - Name = Name.substr(strlen(".custom_section.")); + // Create symbols for debug/custom sections. + for (MCSection &Sec : Asm) { + auto &DebugSection = static_cast(Sec); + StringRef SectionName = DebugSection.getSectionName(); - MCSymbol* Begin = Sec.getBeginSymbol(); - if (Begin) - WasmIndices[cast(Begin)] = CustomSections.size(); - CustomSections.emplace_back(Name, &Section); - } + // TODO: Add support for non-debug metadata sections? + if (!Sec.getKind().isMetadata() || !SectionName.startswith(".debug_")) + continue; + + uint32_t ElementIndex = CustomSections.size(); + CustomSections.emplace_back(SectionName, &DebugSection); + + MCSymbolWasm *SectionSym = + cast(Ctx.getOrCreateSymbol(SectionName)); + CustomSectionSymbols[SectionSym] = &DebugSection; + + wasm::WasmSymbolInfo Info; + Info.Name = SectionSym->getName(); + Info.Kind = wasm::WASM_SYMBOL_TYPE_SECTION; + Info.Flags = wasm::WASM_SYMBOL_BINDING_LOCAL; + Info.ElementIndex = ElementIndex; + SymbolIndices[SectionSym] = SymbolInfos.size(); + SymbolInfos.emplace_back(Info); } // Populate WasmIndices and DataLocations for defined symbols. @@ -1340,9 +1380,12 @@ // Finally, populate the symbol table itself, in its "natural" order. 
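
// Sketch of the data-segment layout loop above: the running DataSize is
// rounded up to each segment's alignment before the segment's offset is
// assigned. Plain C++ stand-ins for llvm::alignTo and the segment list;
// alignments are assumed to be >= 1.
#include <cstdint>
#include <vector>

static uint64_t alignUp(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

struct SegmentStub { uint64_t Offset, Size, Alignment; };

static uint64_t layoutSegments(std::vector<SegmentStub> &Segments) {
  uint64_t DataSize = 0;
  for (SegmentStub &S : Segments) {
    DataSize = alignUp(DataSize, S.Alignment); // DataSize = alignTo(...)
    S.Offset = DataSize;                       // Segment.Offset = DataSize
    DataSize += S.Size;                        // DataSize += Segment.Data.size()
  }
  return DataSize;
}
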
for (const MCSymbol &S : Asm.symbols()) { const auto &WS = static_cast(S); - if (!isInSymtab(WS)) + if (WS.isTemporary() && WS.getName().empty()) + continue; + if (WS.isComdat() && !WS.isDefined()) + continue; + if (WS.isTemporary() && WS.isData() && !WS.getSize()) continue; - DEBUG(dbgs() << "adding to symtab: " << WS << "\n"); uint32_t Flags = 0; if (WS.isWeak()) @@ -1358,13 +1401,10 @@ Info.Name = WS.getName(); Info.Kind = WS.getType(); Info.Flags = Flags; - if (!WS.isData()) { - assert(WasmIndices.count(&WS) > 0); + if (!WS.isData()) Info.ElementIndex = WasmIndices.find(&WS)->second; - } else if (WS.isDefined()) { - assert(DataLocations.count(&WS) > 0); + else if (WS.isDefined()) Info.DataRef = DataLocations.find(&WS)->second; - } SymbolIndices[&WS] = SymbolInfos.size(); SymbolInfos.emplace_back(Info); } @@ -1415,13 +1455,10 @@ if (Frag.hasInstructions() || Frag.getKind() != MCFragment::FT_Data) report_fatal_error("only data supported in .init_array section"); uint16_t Priority = UINT16_MAX; - unsigned PrefixLength = strlen(".init_array"); - if (WS.getSectionName().size() > PrefixLength) { - if (WS.getSectionName()[PrefixLength] != '.') + if (WS.getSectionName().size() != 11) { + if (WS.getSectionName()[11] != '.') report_fatal_error(".init_array section priority should start with '.'"); - if (WS.getSectionName() - .substr(PrefixLength + 1) - .getAsInteger(10, Priority)) + if (WS.getSectionName().substr(12).getAsInteger(10, Priority)) report_fatal_error("invalid .init_array section priority"); } const auto &DataFrag = cast(Frag); @@ -1462,6 +1499,7 @@ writeCodeSection(Asm, Layout, Functions); writeDataSection(); writeCustomSections(Asm, Layout); + updateCustomSectionRelocations(Functions, Layout); writeLinkingMetaDataSection(SymbolInfos, InitFuncs, Comdats); writeRelocSection(CodeSectionIndex, "CODE", CodeRelocations); writeRelocSection(DataSectionIndex, "DATA", DataRelocations); Index: lib/Support/RWMutex.cpp =================================================================== --- lib/Support/RWMutex.cpp +++ lib/Support/RWMutex.cpp @@ -13,7 +13,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/RWMutex.h" -#include "llvm/Config/config.h" +#include "llvm/Config/llvm-config.h" //===----------------------------------------------------------------------===// //=== WARNING: Implementation here must contain only TRULY operating system Index: lib/Support/Unix/Path.inc =================================================================== --- lib/Support/Unix/Path.inc +++ lib/Support/Unix/Path.inc @@ -365,9 +365,6 @@ #elif defined(__CYGWIN__) // Cygwin doesn't expose this information; would need to use Win32 API. return false; -#elif defined(__Fuchsia__) - // Fuchsia doesn't yet support remote filesystem mounts. 
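
// Sketch of the ".init_array" section-name parsing restored above: a bare
// ".init_array" keeps the default priority, otherwise the prefix must be
// followed by '.' and a decimal priority. std::string/strtoul stand in for
// StringRef::getAsInteger; the name is assumed to start with the prefix.
#include <cstdint>
#include <cstdlib>
#include <string>

static bool parseInitArrayPriority(const std::string &Name, uint16_t &Priority) {
  const std::string Prefix = ".init_array";
  Priority = UINT16_MAX;                  // default priority
  if (Name.size() == Prefix.size())
    return true;                          // plain ".init_array"
  if (Name[Prefix.size()] != '.')
    return false;                         // "priority should start with '.'"
  const char *Start = Name.c_str() + Prefix.size() + 1;
  char *End = nullptr;
  unsigned long Value = std::strtoul(Start, &End, 10);
  if (End == Start || *End != '\0' || Value > UINT16_MAX)
    return false;                         // "invalid .init_array section priority"
  Priority = static_cast<uint16_t>(Value);
  return true;
}
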
- return true; #elif defined(__sun) // statvfs::f_basetype contains a null-terminated FSType name of the mounted target StringRef fstype(Vfs.f_basetype); Index: lib/Target/AArch64/AArch64InstrFormats.td =================================================================== --- lib/Target/AArch64/AArch64InstrFormats.td +++ lib/Target/AArch64/AArch64InstrFormats.td @@ -6062,8 +6062,7 @@ def #NAME#32 : BaseSIMDThreeScalar; let Predicates = [HasNEON, HasFullFP16] in { - def #NAME#16 : BaseSIMDThreeScalar; + def #NAME#16 : BaseSIMDThreeScalar; } // Predicates = [HasNEON, HasFullFP16] } @@ -7891,12 +7890,35 @@ multiclass SIMDFPScalarRShift opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { - def h : BaseSIMDScalarShift { + def SHr : BaseSIMDScalarShift { + let Inst{19-16} = imm{3-0}; + } + } // Predicates = [HasNEON, HasFullFP16] + + let Predicates = [HasNEON, HasFullFP16] in { + def HSr : BaseSIMDScalarShift { let Inst{19-16} = imm{3-0}; + let Inst{23-22} = 0b11; } } // Predicates = [HasNEON, HasFullFP16] + + def HDr : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + let Inst{23-22} = 0b11; + } + + def DHr : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + let Inst{23-22} = 0b11; + let Inst{31} = 1; + } + def s : BaseSIMDScalarShift { let Inst{20-16} = imm{4-0}; @@ -7906,6 +7928,13 @@ FPR64, FPR64, vecshiftR64, asm, []> { let Inst{21-16} = imm{5-0}; } + + let Predicates = [HasNEON, HasFullFP16] in { + def h : BaseSIMDScalarShift { + let Inst{19-16} = imm{3-0}; + } + } // Predicates = [HasNEON, HasFullFP16] } multiclass SIMDScalarRShiftD opc, string asm, Index: lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.td +++ lib/Target/AArch64/AArch64InstrInfo.td @@ -4883,17 +4883,29 @@ def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn), vecshiftR64:$imm)), (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm), - (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR32:$imm)), + (FCVTZSHDr (i64 FPR64:$Rn), vecshiftR32:$imm)>; +def : Pat<(i32 (int_aarch64_neon_vcvtfp2fxu FPR16:$Rn, vecshiftR32:$imm)), + (FCVTZUSHr FPR16:$Rn, vecshiftR32:$imm)>; +def : Pat<(i32 (int_aarch64_neon_vcvtfp2fxs FPR16:$Rn, vecshiftR32:$imm)), + (FCVTZSSHr FPR16:$Rn, vecshiftR32:$imm)>; +def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f16 FPR16:$Rn), vecshiftR64:$imm)), + (FCVTZSDHr (f16 FPR16:$Rn), vecshiftR64:$imm)>; +def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR16:$imm)), + (UCVTFHSr FPR32:$Rn, vecshiftR16:$imm)>; def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm), (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), - (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn), vecshiftR64:$imm)), (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 FPR32:$Rn), vecshiftR16:$imm)), + (SCVTFHSr FPR32:$Rn, vecshiftR16:$imm)>; +def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR16:$imm)), + (SCVTFHSr FPR32:$Rn, vecshiftR16:$imm)>; +def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), + (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; def : Pat<(v1f64 
(int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn), vecshiftR64:$imm)), (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; Index: lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp =================================================================== --- lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -96,18 +96,9 @@ } return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12L; - case AArch64::fixup_aarch64_pcrel_adr_imm21: - return COFF::IMAGE_REL_ARM64_REL21; - case AArch64::fixup_aarch64_pcrel_adrp_imm21: return COFF::IMAGE_REL_ARM64_PAGEBASE_REL21; - case AArch64::fixup_aarch64_pcrel_branch14: - return COFF::IMAGE_REL_ARM64_BRANCH14; - - case AArch64::fixup_aarch64_pcrel_branch19: - return COFF::IMAGE_REL_ARM64_BRANCH19; - case AArch64::fixup_aarch64_pcrel_branch26: case AArch64::fixup_aarch64_pcrel_call26: return COFF::IMAGE_REL_ARM64_BRANCH26; Index: lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -454,7 +454,7 @@ case Intrinsic::lifetime_end: case Intrinsic::invariant_start: case Intrinsic::invariant_end: - case Intrinsic::launder_invariant_group: + case Intrinsic::invariant_group_barrier: case Intrinsic::objectsize: return true; default: @@ -878,7 +878,7 @@ } case Intrinsic::invariant_start: case Intrinsic::invariant_end: - case Intrinsic::launder_invariant_group: + case Intrinsic::invariant_group_barrier: Intr->eraseFromParent(); // FIXME: I think the invariant marker should still theoretically apply, // but the intrinsics need to be changed to accept pointers with any Index: lib/Target/AMDGPU/AMDGPURegisterInfo.h =================================================================== --- lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -31,7 +31,7 @@ /// \returns the sub reg enum value for the given \p Channel /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0) - static unsigned getSubRegFromChannel(unsigned Channel); + unsigned getSubRegFromChannel(unsigned Channel) const; void reserveRegisterTuples(BitVector &, unsigned Reg) const; }; Index: lib/Target/AMDGPU/AMDGPURegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -25,7 +25,7 @@ // they are not supported at this time. 
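
// Loose illustration of the fixup-kind to COFF relocation mapping touched in
// the AArch64WinCOFFObjectWriter hunk above (the patch drops the ADR and
// BRANCH14/BRANCH19 cases). Enumerators here are placeholders, not the real
// LLVM/COFF names.
enum class Arm64Fixup { AdrpImm21, Branch26, Call26 };
enum class CoffReloc { PageBaseRel21, Branch26Rel };

static CoffReloc coffRelocFor(Arm64Fixup F) {
  switch (F) {
  case Arm64Fixup::AdrpImm21:
    return CoffReloc::PageBaseRel21;   // IMAGE_REL_ARM64_PAGEBASE_REL21
  case Arm64Fixup::Branch26:
  case Arm64Fixup::Call26:
    return CoffReloc::Branch26Rel;     // IMAGE_REL_ARM64_BRANCH26
  }
  return CoffReloc::Branch26Rel;       // not reached; keeps compilers quiet
}
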
//===----------------------------------------------------------------------===// -unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) { +unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { static const unsigned SubRegs[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, Index: lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp =================================================================== --- lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -308,7 +308,7 @@ DstMI = Reg; else DstMI = TRI->getMatchingSuperReg(Reg, - AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)), + TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), &AMDGPU::R600_Reg128RegClass); } if (MO.isUse()) { @@ -317,7 +317,7 @@ SrcMI = Reg; else SrcMI = TRI->getMatchingSuperReg(Reg, - AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)), + TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), &AMDGPU::R600_Reg128RegClass); } } Index: lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp =================================================================== --- lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -218,13 +218,13 @@ } } if (IsReduction) { - unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan); + unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); Src0 = TRI.getSubReg(Src0, SubRegIndex); Src1 = TRI.getSubReg(Src1, SubRegIndex); } else if (IsCube) { static const int CubeSrcSwz[] = {2, 2, 0, 1}; - unsigned SubRegIndex0 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[Chan]); - unsigned SubRegIndex1 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[3 - Chan]); + unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]); + unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]); Src1 = TRI.getSubReg(Src0, SubRegIndex1); Src0 = TRI.getSubReg(Src0, SubRegIndex0); } @@ -233,7 +233,7 @@ bool Mask = false; bool NotLast = true; if (IsCube) { - unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan); + unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); DstReg = TRI.getSubReg(DstReg, SubRegIndex); } else { // Mask the write if the original instruction does not write to Index: lib/Target/AMDGPU/R600InstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/R600InstrInfo.cpp +++ lib/Target/AMDGPU/R600InstrInfo.cpp @@ -72,7 +72,7 @@ if (VectorComponents > 0) { for (unsigned I = 0; I < VectorComponents; I++) { - unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(I); + unsigned SubRegIndex = RI.getSubRegFromChannel(I); buildDefaultInstruction(MBB, MI, AMDGPU::MOV, RI.getSubReg(DestReg, SubRegIndex), RI.getSubReg(SrcReg, SubRegIndex)) Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -6755,7 +6755,7 @@ return SDValue(); } - if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG()) + if (VT != MVT::i32) return SDValue(); // add x, zext (setcc) => addcarry x, 0, setcc Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -579,10 +579,6 @@ void finalizeLowering(MachineFunction &MF) const override; - /// 
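
// Minimal illustration of the channel-to-sub-register lookup performed by
// getSubRegFromChannel above: a straight table indexed by channel. The enum
// values are placeholders for the generated AMDGPU::sub* constants.
#include <cassert>
#include <cstddef>

enum SubRegIdx { sub0, sub1, sub2, sub3 /* ... the real table goes to sub15 */ };

static unsigned subRegFromChannel(unsigned Channel) {
  static const unsigned SubRegs[] = {sub0, sub1, sub2, sub3};
  assert(Channel < sizeof(SubRegs) / sizeof(SubRegs[0]) &&
         "channel out of range");
  return SubRegs[Channel];
}
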
Return the correct alignment for the current calling convention. - unsigned getABIAlignmentForCallingConv(Type *ArgTy, - DataLayout DL) const override; - protected: std::pair findRepresentativeClass(const TargetRegisterInfo *TRI, Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -14783,18 +14783,6 @@ return (Members > 0 && Members <= 4); } -/// Return the correct alignment for the current calling convention. -unsigned -ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy, - DataLayout DL) const { - if (!ArgTy->isVectorTy()) - return DL.getABITypeAlignment(ArgTy); - - // Avoid over-aligning vector parameters. It would require realigning the - // stack and waste space for no real benefit. - return std::min(DL.getABITypeAlignment(ArgTy), DL.getStackAlignment()); -} - /// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when /// passing according to AAPCS rules. Index: lib/Target/ARM/ARMScheduleA9.td =================================================================== --- lib/Target/ARM/ARMScheduleA9.td +++ lib/Target/ARM/ARMScheduleA9.td @@ -1996,15 +1996,15 @@ // Reserve A9UnitFP for 2 consecutive cycles. def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; - let ResourceCycles = [2, 1]; + let ResourceCycles = [2]; } def A9Write2V7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 7; - let ResourceCycles = [2, 1]; + let ResourceCycles = [2]; } def A9Write2V9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; - let ResourceCycles = [2, 1]; + let ResourceCycles = [2]; } // Branches don't have a def operand but still consume resources. Index: lib/Target/PowerPC/PPCCTRLoops.cpp =================================================================== --- lib/Target/PowerPC/PPCCTRLoops.cpp +++ lib/Target/PowerPC/PPCCTRLoops.cpp @@ -30,10 +30,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -514,12 +512,6 @@ if (MadeChange) return MadeChange; - // Bail out if the loop has irreducible control flow. - LoopBlocksRPO RPOT(L); - RPOT.perform(LI); - if (containsIrreducibleCFG(RPOT, *LI)) - return false; - #ifndef NDEBUG // Stop trying after reaching the limit (if any). int Limit = CTRLoopLimit; @@ -580,12 +572,6 @@ if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 64 : 32)) continue; - // If this exiting block is contained in a nested loop, it is not eligible - // for insertion of the branch-and-decrement since the inner loop would - // end up messing up the value in the CTR. - if (LI->getLoopFor(*I) != L) - continue; - // We now have a loop-invariant count of loop iterations (which is not the // constant zero) for which we know that this loop will not exit via this // exisiting block. Index: lib/Target/PowerPC/PPCISelLowering.h =================================================================== --- lib/Target/PowerPC/PPCISelLowering.h +++ lib/Target/PowerPC/PPCISelLowering.h @@ -1101,7 +1101,6 @@ // tail call. 
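
// Sketch of the calling-convention alignment rule in the ARM hunk above:
// non-vector arguments keep their ABI type alignment, while vector arguments
// are capped at the stack alignment so the stack is not realigned for them.
// Plain integers stand in for the DataLayout queries.
#include <algorithm>
#include <cstdint>

static uint64_t abiAlignForCallingConv(bool IsVectorTy, uint64_t ABITypeAlign,
                                       uint64_t StackAlign) {
  if (!IsVectorTy)
    return ABITypeAlign;
  return std::min(ABITypeAlign, StackAlign); // avoid over-aligning vectors
}
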
This will cause the optimizers to attempt to move, or // duplicate return instructions to help enable tail call optimizations. bool mayBeEmittedAsTailCall(const CallInst *CI) const override; - bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; }; // end class PPCTargetLowering namespace PPC { Index: lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- lib/Target/PowerPC/PPCISelLowering.cpp +++ lib/Target/PowerPC/PPCISelLowering.cpp @@ -13978,20 +13978,3 @@ // If the function is local then we have a good chance at tail-calling it return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee); } - -bool PPCTargetLowering:: -isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const { - const Value *Mask = AndI.getOperand(1); - // If the mask is suitable for andi. or andis. we should sink the and. - if (const ConstantInt *CI = dyn_cast(Mask)) { - // Can't handle constants wider than 64-bits. - if (CI->getBitWidth() > 64) - return false; - int64_t ConstVal = CI->getZExtValue(); - return isUInt<16>(ConstVal) || - (isUInt<16>(ConstVal >> 16) && !(ConstVal & 0xFFFF)); - } - - // For non-constant masks, we can always use the record-form and. - return true; -} Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -4507,7 +4507,7 @@ defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SchedWriteVecIMul, HasBWI, 1>, T8PD; defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, - SchedWriteVecALU, HasBWI, 1>; + SchedWriteVecIMul, HasBWI, 1>; defm VPMULDQ : avx512_binop_rm_vl_q<0x28, "vpmuldq", X86pmuldq, SchedWriteVecIMul, HasAVX512, 1>, T8PD; defm VPMULUDQ : avx512_binop_rm_vl_q<0xF4, "vpmuludq", X86pmuludq, @@ -5587,24 +5587,24 @@ } defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli, - SchedWriteVecShiftImm>, + SchedWriteVecShift>, avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli, - SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; + SchedWriteVecShift>, AVX512BIi8Base, EVEX_4V; defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli, - SchedWriteVecShiftImm>, + SchedWriteVecShift>, avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli, - SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; + SchedWriteVecShift>, AVX512BIi8Base, EVEX_4V; defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai, - SchedWriteVecShiftImm>, + SchedWriteVecShift>, avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai, - SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; + SchedWriteVecShift>, AVX512BIi8Base, EVEX_4V; defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri, - SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; + SchedWriteVecShift>, AVX512BIi8Base, EVEX_4V; defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli, - SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; + SchedWriteVecShift>, AVX512BIi8Base, EVEX_4V; defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl, SchedWriteVecShift>; @@ -9711,7 +9711,7 @@ } defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw", - SchedWritePSADBW, avx512vl_i16_info, avx512vl_i8_info>, + SchedWriteVecIMul, avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>; multiclass avx512_unary_rm opc, string OpcodeStr, SDNode OpNode, @@ -9842,8 +9842,9 @@ } } +// FIXME: Is there a 
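
// Sketch of the mask test in the isMaskAndCmp0FoldingBeneficial hunk above: a
// constant mask suits andi./andis. when it fits in the low 16 bits, or in the
// high 16 bits with the low half clear; non-constant masks always use the
// record-form and. in the original code.
#include <cstdint>

static bool isUInt16(uint64_t V) { return V <= 0xFFFFu; }

static bool maskSuitsRecordFormAnd(uint64_t ConstVal) {
  return isUInt16(ConstVal) ||
         (isUInt16(ConstVal >> 16) && (ConstVal & 0xFFFF) == 0);
}
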
better scheduler class for VPLZCNT? defm VPLZCNT : avx512_unary_rm_vl_dq<0x44, 0x44, "vplzcnt", ctlz, - SchedWriteVecIMul, HasCDI>; + SchedWriteVecALU, HasCDI>; // FIXME: Is there a better scheduler class for VPCONFLICT? defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, Index: lib/Target/X86/X86InstrMMX.td =================================================================== --- lib/Target/X86/X86InstrMMX.td +++ lib/Target/X86/X86InstrMMX.td @@ -52,8 +52,7 @@ multiclass MMXI_binop_rmi_int opc, bits<8> opc2, Format ImmForm, string OpcodeStr, Intrinsic IntId, - Intrinsic IntId2, X86FoldableSchedWrite sched, - X86FoldableSchedWrite schedImm> { + Intrinsic IntId2, X86FoldableSchedWrite sched> { def rr : MMXI, - Sched<[schedImm]>; + Sched<[sched]>; } } @@ -110,19 +109,18 @@ } /// PALIGN MMX instructions (require SSSE3). -multiclass ssse3_palign_mm { +multiclass ssse3_palign_mm { def rri : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>, - Sched<[sched]>; + Sched<[WriteShuffle]>; def rmi : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR64:$dst, (IntId VR64:$src1, (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[WriteShuffleLd, ReadAfterLd]>; } multiclass sse12_cvt_pint opc, RegisterClass SrcRC, RegisterClass DstRC, @@ -287,228 +285,218 @@ // Arithmetic Instructions defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b, - SchedWriteVecALU.MMX>; + WriteVecALU>; defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w, - SchedWriteVecALU.MMX>; + WriteVecALU>; defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d, - SchedWriteVecALU.MMX>; + WriteVecALU>; // -- Addition defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b, - SchedWriteVecALU.MMX, 1>; + WriteVecALU, 1>; defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w, - SchedWriteVecALU.MMX, 1>; + WriteVecALU, 1>; defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d, - SchedWriteVecALU.MMX, 1>; + WriteVecALU, 1>; let Predicates = [HasSSE2] in defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q, - SchedWriteVecALU.MMX, 1>; + WriteVecALU, 1>; defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, - SchedWriteVecALU.MMX, 1>; + WriteVecALU, 1>; defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, - SchedWriteVecALU.MMX, 1>; + WriteVecALU, 1>; defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, - SchedWriteVecALU.MMX, 1>; + WriteVecALU, 1>; defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, - SchedWriteVecALU.MMX, 1>; + WriteVecALU, 1>; defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w, - SchedWritePHAdd.MMX>; + WritePHAdd>; defm MMX_PHADDD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d, - SchedWritePHAdd.MMX>; + WritePHAdd>; defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw, - SchedWritePHAdd.MMX>; + WritePHAdd>; // -- Subtraction defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b, - SchedWriteVecALU.MMX>; + WriteVecALU>; defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w, - 
SchedWriteVecALU.MMX>; + WriteVecALU>; defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d, - SchedWriteVecALU.MMX>; + WriteVecALU>; let Predicates = [HasSSE2] in defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q, - SchedWriteVecALU.MMX>; + WriteVecALU>; defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b, - SchedWriteVecALU.MMX>; + WriteVecALU>; defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w, - SchedWriteVecALU.MMX>; + WriteVecALU>; defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b, - SchedWriteVecALU.MMX>; + WriteVecALU>; defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w, - SchedWriteVecALU.MMX>; + WriteVecALU>; defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w, - SchedWritePHAdd.MMX>; + WritePHAdd>; defm MMX_PHSUBD : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d, - SchedWritePHAdd.MMX>; + WritePHAdd>; defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw, - SchedWritePHAdd.MMX>; + WritePHAdd>; // -- Multiplication defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w, - SchedWriteVecIMul.MMX, 1>; + WriteVecIMul, 1>; defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w, - SchedWriteVecIMul.MMX, 1>; + WriteVecIMul, 1>; let Predicates = [HasSSE1] in defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, - SchedWriteVecIMul.MMX, 1>; + WriteVecIMul, 1>; let Predicates = [HasSSE2] in defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, - SchedWriteVecIMul.MMX, 1>; + WriteVecIMul, 1>; defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw", int_x86_ssse3_pmul_hr_sw, - SchedWriteVecIMul.MMX, 1>; + WriteVecIMul, 1>; // -- Miscellanea defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, - SchedWriteVecIMul.MMX, 1>; + WriteVecIMul, 1>; defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw", - int_x86_ssse3_pmadd_ub_sw, - SchedWriteVecIMul.MMX>; + int_x86_ssse3_pmadd_ub_sw, WriteVecIMul>; let Predicates = [HasSSE1] in { defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, - SchedWriteVecALU.MMX, 1>; + WriteVecALU, 1>; defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, - SchedWriteVecALU.MMX, 1>; + WriteVecALU, 1>; defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, - SchedWriteVecALU.MMX, 1>; + WriteVecALU, 1>; defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, - SchedWriteVecALU.MMX, 1>; + WriteVecALU, 1>; defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, - SchedWriteVecALU.MMX, 1>; + WriteVecALU, 1>; defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, - SchedWriteVecALU.MMX, 1>; + WriteVecALU, 1>; defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, - SchedWritePSADBW.MMX, 1>; + WritePSADBW, 1>; } defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b, - SchedWriteVecALU.MMX>; + WriteVecALU>; defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w, - SchedWriteVecALU.MMX>; + WriteVecALU>; defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d, - SchedWriteVecALU.MMX>; + WriteVecALU>; let Constraints = "$src1 = $dst" in - defm MMX_PALIGNR : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b, - SchedWriteShuffle.MMX>; + defm MMX_PALIGNR : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>; // 
Logical Instructions defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, - SchedWriteVecLogic.MMX, 1>; + WriteVecLogic, 1>; defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, - SchedWriteVecLogic.MMX, 1>; + WriteVecLogic, 1>; defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, - SchedWriteVecLogic.MMX, 1>; + WriteVecLogic, 1>; defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn, - SchedWriteVecLogic.MMX>; + WriteVecLogic>; // Shift Instructions defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", int_x86_mmx_psrl_w, int_x86_mmx_psrli_w, - SchedWriteVecShift.MMX, - SchedWriteVecShiftImm.MMX>; + WriteVecShift>; defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", int_x86_mmx_psrl_d, int_x86_mmx_psrli_d, - SchedWriteVecShift.MMX, - SchedWriteVecShiftImm.MMX>; + WriteVecShift>; defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", int_x86_mmx_psrl_q, int_x86_mmx_psrli_q, - SchedWriteVecShift.MMX, - SchedWriteVecShiftImm.MMX>; + WriteVecShift>; defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", int_x86_mmx_psll_w, int_x86_mmx_pslli_w, - SchedWriteVecShift.MMX, - SchedWriteVecShiftImm.MMX>; + WriteVecShift>; defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", int_x86_mmx_psll_d, int_x86_mmx_pslli_d, - SchedWriteVecShift.MMX, - SchedWriteVecShiftImm.MMX>; + WriteVecShift>; defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", int_x86_mmx_psll_q, int_x86_mmx_pslli_q, - SchedWriteVecShift.MMX, - SchedWriteVecShiftImm.MMX>; + WriteVecShift>; defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", int_x86_mmx_psra_w, int_x86_mmx_psrai_w, - SchedWriteVecShift.MMX, - SchedWriteVecShiftImm.MMX>; + WriteVecShift>; defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", int_x86_mmx_psra_d, int_x86_mmx_psrai_d, - SchedWriteVecShift.MMX, - SchedWriteVecShiftImm.MMX>; + WriteVecShift>; // Comparison Instructions defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b, - SchedWriteVecALU.MMX>; + WriteVecALU>; defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w, - SchedWriteVecALU.MMX>; + WriteVecALU>; defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d, - SchedWriteVecALU.MMX>; + WriteVecALU>; defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b, - SchedWriteVecALU.MMX>; + WriteVecALU>; defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w, - SchedWriteVecALU.MMX>; + WriteVecALU>; defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d, - SchedWriteVecALU.MMX>; + WriteVecALU>; // -- Unpack Instructions defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw", int_x86_mmx_punpckhbw, - SchedWriteShuffle.MMX>; + WriteShuffle>; defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd", int_x86_mmx_punpckhwd, - SchedWriteShuffle.MMX>; + WriteShuffle>; defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq", int_x86_mmx_punpckhdq, - SchedWriteShuffle.MMX>; + WriteShuffle>; defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw", int_x86_mmx_punpcklbw, - SchedWriteShuffle.MMX, + WriteShuffle, 0, i32mem>; defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd", int_x86_mmx_punpcklwd, - SchedWriteShuffle.MMX, + WriteShuffle, 0, i32mem>; defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq", int_x86_mmx_punpckldq, - SchedWriteShuffle.MMX, + WriteShuffle, 0, i32mem>; // -- Pack Instructions defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", 
int_x86_mmx_packsswb, - SchedWriteShuffle.MMX>; + WriteShuffle>; defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw, - SchedWriteShuffle.MMX>; + WriteShuffle>; defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb, - SchedWriteShuffle.MMX>; + WriteShuffle>; // -- Shuffle Instructions defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b, - SchedWriteVarShuffle.MMX>; + WriteVarShuffle>; def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, u8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))]>, - Sched<[SchedWriteShuffle.MMX]>; + Sched<[WriteShuffle]>; def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, (int_x86_sse_pshuf_w (load_mmx addr:$src1), imm:$src2))]>, - Sched<[SchedWriteShuffle.MMX.Folded]>; + Sched<[WriteShuffleLd]>; // -- Conversion Instructions defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi, @@ -582,7 +570,7 @@ (x86mmx (MMX_MOVQ64rm addr:$src))>; // Misc. -let SchedRW = [SchedWriteShuffle.MMX] in { +let SchedRW = [WriteShuffle] in { let Uses = [EDI], Predicates = [HasSSE1,Not64BitMode] in def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -3396,7 +3396,6 @@ string OpcodeStr, SDNode OpNode, SDNode OpNode2, RegisterClass RC, X86FoldableSchedWrite sched, - X86FoldableSchedWrite schedImm, ValueType DstVT, ValueType SrcVT, PatFrag ld_frag, bit Is2Addr = 1> { // src2 is always 128-bit @@ -3421,28 +3420,25 @@ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>, - Sched<[schedImm]>; + Sched<[sched]>; } multiclass PDI_binop_rmi_all opc, bits<8> opc2, Format ImmForm, string OpcodeStr, SDNode OpNode, SDNode OpNode2, ValueType DstVT128, ValueType DstVT256, ValueType SrcVT, - X86SchedWriteWidths sched, - X86SchedWriteWidths schedImm, Predicate prd> { + X86SchedWriteWidths sched, Predicate prd> { let Predicates = [HasAVX, prd] in defm V#NAME : PDI_binop_rmi, VEX_4V, VEX_WIG; + OpNode, OpNode2, VR128, sched.XMM, DstVT128, + SrcVT, loadv2i64, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2, prd] in defm V#NAME#Y : PDI_binop_rmi, VEX_4V, VEX_L, - VEX_WIG; + OpNode, OpNode2, VR256, sched.YMM, DstVT256, + SrcVT, loadv2i64, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rmi; + VR128, sched.XMM, DstVT128, SrcVT, memopv2i64>; } multiclass PDI_binop_ri opc, Format ImmForm, string OpcodeStr, @@ -3473,30 +3469,25 @@ let ExeDomain = SSEPackedInt in { defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, v8i16, v16i16, v8i16, SchedWriteVecShift, - SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; + NoVLX_Or_NoBWI>; defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, - v4i32, v8i32, v4i32, SchedWriteVecShift, - SchedWriteVecShiftImm, NoVLX>; + v4i32, v8i32, v4i32, SchedWriteVecShift, NoVLX>; defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, - v2i64, v4i64, v2i64, SchedWriteVecShift, - SchedWriteVecShiftImm, NoVLX>; + v2i64, v4i64, 
v2i64, SchedWriteVecShift, NoVLX>; defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, v8i16, v16i16, v8i16, SchedWriteVecShift, - SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; + NoVLX_Or_NoBWI>; defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, - v4i32, v8i32, v4i32, SchedWriteVecShift, - SchedWriteVecShiftImm, NoVLX>; + v4i32, v8i32, v4i32, SchedWriteVecShift, NoVLX>; defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, - v2i64, v4i64, v2i64, SchedWriteVecShift, - SchedWriteVecShiftImm, NoVLX>; + v2i64, v4i64, v2i64, SchedWriteVecShift, NoVLX>; defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, v8i16, v16i16, v8i16, SchedWriteVecShift, - SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; + NoVLX_Or_NoBWI>; defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, - v4i32, v8i32, v4i32, SchedWriteVecShift, - SchedWriteVecShiftImm, NoVLX>; + v4i32, v8i32, v4i32, SchedWriteVecShift, NoVLX>; defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq, SchedWriteShuffle>; @@ -4693,16 +4684,16 @@ let isCommutable = 0 in { defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128, loadv2i64, i128mem, - SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; + WritePHAdd, 0>, VEX_4V, VEX_WIG; defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128, loadv2i64, i128mem, - SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; + WritePHAdd, 0>, VEX_4V, VEX_WIG; defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128, loadv2i64, i128mem, - SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; + WritePHAdd, 0>, VEX_4V, VEX_WIG; defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128, loadv2i64, i128mem, - SchedWritePHAdd.XMM, 0>, VEX_4V; + WritePHAdd, 0>, VEX_4V; defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", int_x86_ssse3_psign_b_128, SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; @@ -4714,10 +4705,10 @@ SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", int_x86_ssse3_phadd_sw_128, - SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; + WritePHAdd, loadv2i64, 0>, VEX_4V, VEX_WIG; defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", int_x86_ssse3_phsub_sw_128, - SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; + WritePHAdd, loadv2i64, 0>, VEX_4V, VEX_WIG; } } @@ -4739,16 +4730,16 @@ let isCommutable = 0 in { defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16, VR256, loadv4i64, i256mem, - SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; + WritePHAdd, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256, loadv4i64, i256mem, - SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; + WritePHAdd, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16, VR256, loadv4i64, i256mem, - SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; + WritePHAdd, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256, loadv4i64, i256mem, - SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L; + WritePHAdd, 0>, VEX_4V, VEX_L; defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b, SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w, @@ -4757,10 +4748,10 @@ SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", int_x86_avx2_phadd_sw, - 
SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG; + WritePHAdd>, VEX_4V, VEX_L, VEX_WIG; defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", int_x86_avx2_phsub_sw, - SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG; + WritePHAdd>, VEX_4V, VEX_L, VEX_WIG; } } @@ -4768,13 +4759,13 @@ let ImmT = NoImm, Constraints = "$src1 = $dst" in { let isCommutable = 0 in { defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128, - memopv2i64, i128mem, SchedWritePHAdd.XMM>; + memopv2i64, i128mem, WritePHAdd>; defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128, - memopv2i64, i128mem, SchedWritePHAdd.XMM>; + memopv2i64, i128mem, WritePHAdd>; defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128, - memopv2i64, i128mem, SchedWritePHAdd.XMM>; + memopv2i64, i128mem, WritePHAdd>; defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128, - memopv2i64, i128mem, SchedWritePHAdd.XMM>; + memopv2i64, i128mem, WritePHAdd>; defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128, SchedWriteVecALU.XMM, memopv2i64>; defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128, @@ -4785,10 +4776,10 @@ memopv2i64, i128mem, SchedWriteVarShuffle.XMM>; defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", int_x86_ssse3_phadd_sw_128, - SchedWritePHAdd.XMM, memopv2i64>; + WritePHAdd, memopv2i64>; defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", int_x86_ssse3_phsub_sw_128, - SchedWritePHAdd.XMM, memopv2i64>; + WritePHAdd, memopv2i64>; defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16, v16i8, VR128, memopv2i64, i128mem, SchedWriteVecIMul.XMM>; @@ -6026,15 +6017,15 @@ let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, VR128, loadv4f32, f128mem, 0, - SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG; + SchedWriteFAdd.XMM>, VEX_4V, VEX_WIG; let ExeDomain = SSEPackedDouble in defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, VR128, loadv2f64, f128mem, 0, - SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG; + SchedWriteFAdd.XMM>, VEX_4V, VEX_WIG; let ExeDomain = SSEPackedSingle in defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, VR256, loadv8f32, i256mem, 0, - SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG; + SchedWriteFAdd.YMM>, VEX_4V, VEX_L, VEX_WIG; } let Predicates = [HasAVX2] in { @@ -6055,11 +6046,11 @@ let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, VR128, memopv4f32, f128mem, 1, - SchedWriteDPPS.XMM>; + SchedWriteFAdd.XMM>; let ExeDomain = SSEPackedDouble in defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, VR128, memopv2f64, f128mem, 1, - SchedWriteDPPD.XMM>; + SchedWriteFAdd.XMM>; } /// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate Index: lib/Target/X86/X86InstrXOP.td =================================================================== --- lib/Target/X86/X86InstrXOP.td +++ lib/Target/X86/X86InstrXOP.td @@ -18,7 +18,7 @@ def rm : IXOP, XOP, - Sched<[WritePHAdd.Folded, ReadAfterLd]>; + Sched<[WritePHAddLd, ReadAfterLd]>; } let ExeDomain = SSEPackedInt in { @@ -41,128 +41,119 @@ // Scalar load 2 addr operand instructions multiclass xop2opsld opc, string OpcodeStr, Intrinsic Int, - Operand memop, ComplexPattern mem_cpat, - X86FoldableSchedWrite sched> { + Operand memop, ComplexPattern mem_cpat> { def rr : IXOP, XOP, Sched<[sched]>; + [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[WriteFAdd]>; def rm : IXOP, XOP, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[WriteFAddLd, 
ReadAfterLd]>; } multiclass xop2op128 opc, string OpcodeStr, Intrinsic Int, - PatFrag memop, X86FoldableSchedWrite sched> { + PatFrag memop> { def rr : IXOP, XOP, Sched<[sched]>; + [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[WriteFAdd]>; def rm : IXOP, XOP, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[WriteFAddLd, ReadAfterLd]>; } multiclass xop2op256 opc, string OpcodeStr, Intrinsic Int, - PatFrag memop, X86FoldableSchedWrite sched> { + PatFrag memop> { def Yrr : IXOP, XOP, VEX_L, Sched<[sched]>; + [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L, Sched<[WriteFAddY]>; def Yrm : IXOP, XOP, VEX_L, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[WriteFAddYLd, ReadAfterLd]>; } let ExeDomain = SSEPackedSingle in { defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, - ssmem, sse_load_f32, SchedWriteFAdd.XMM>; - defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32, - SchedWriteFAdd.XMM>; - defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32, - SchedWriteFAdd.YMM>; + ssmem, sse_load_f32>; + defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>; + defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>; } let ExeDomain = SSEPackedDouble in { defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, - sdmem, sse_load_f64, SchedWriteFAdd.XMM>; - defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64, - SchedWriteFAdd.XMM>; - defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64, - SchedWriteFAdd.YMM>; + sdmem, sse_load_f64>; + defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>; + defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>; } multiclass xop3op opc, string OpcodeStr, SDNode OpNode, - ValueType vt128, X86FoldableSchedWrite sched> { + ValueType vt128> { def rr : IXOP, - XOP, Sched<[sched]>; + XOP, Sched<[WriteVarVecShift]>; def rm : IXOP, - XOP_4V, VEX_W, Sched<[sched.Folded, ReadAfterLd]>; + XOP_4V, VEX_W, Sched<[WriteVarVecShiftLd, ReadAfterLd]>; def mr : IXOP, - XOP, Sched<[sched.Folded, ReadAfterLd]>; + XOP, Sched<[WriteVarVecShiftLd, ReadAfterLd]>; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rr_REV : IXOP, - XOP_4V, VEX_W, Sched<[sched]>, FoldGenData; + XOP_4V, VEX_W, Sched<[WriteVarVecShift]>, FoldGenData; } let ExeDomain = SSEPackedInt in { - defm VPROTB : xop3op<0x90, "vprotb", rotl, v16i8, SchedWriteVarVecShift.XMM>; - defm VPROTD : xop3op<0x92, "vprotd", rotl, v4i32, SchedWriteVarVecShift.XMM>; - defm VPROTQ : xop3op<0x93, "vprotq", rotl, v2i64, SchedWriteVarVecShift.XMM>; - defm VPROTW : xop3op<0x91, "vprotw", rotl, v8i16, SchedWriteVarVecShift.XMM>; - defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8, SchedWriteVarVecShift.XMM>; - defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32, SchedWriteVarVecShift.XMM>; - defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64, SchedWriteVarVecShift.XMM>; - defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16, SchedWriteVarVecShift.XMM>; - defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8, SchedWriteVarVecShift.XMM>; - defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32, SchedWriteVarVecShift.XMM>; - defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64, SchedWriteVarVecShift.XMM>; - defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16, SchedWriteVarVecShift.XMM>; + defm VPROTB : xop3op<0x90, "vprotb", rotl, v16i8>; + defm VPROTD : xop3op<0x92, "vprotd", rotl, v4i32>; + defm VPROTQ : 
xop3op<0x93, "vprotq", rotl, v2i64>; + defm VPROTW : xop3op<0x91, "vprotw", rotl, v8i16>; + defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>; + defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>; + defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>; + defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16>; + defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8>; + defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32>; + defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64>; + defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16>; } multiclass xop3opimm opc, string OpcodeStr, SDNode OpNode, - ValueType vt128, X86FoldableSchedWrite sched> { + ValueType vt128> { def ri : IXOPi8, - XOP, Sched<[sched]>; + XOP, Sched<[WriteVecShift]>; def mi : IXOPi8, - XOP, Sched<[sched.Folded, ReadAfterLd]>; + XOP, Sched<[WriteVecShiftLd, ReadAfterLd]>; } let ExeDomain = SSEPackedInt in { - defm VPROTB : xop3opimm<0xC0, "vprotb", X86vrotli, v16i8, - SchedWriteVecShiftImm.XMM>; - defm VPROTD : xop3opimm<0xC2, "vprotd", X86vrotli, v4i32, - SchedWriteVecShiftImm.XMM>; - defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vrotli, v2i64, - SchedWriteVecShiftImm.XMM>; - defm VPROTW : xop3opimm<0xC1, "vprotw", X86vrotli, v8i16, - SchedWriteVecShiftImm.XMM>; + defm VPROTB : xop3opimm<0xC0, "vprotb", X86vrotli, v16i8>; + defm VPROTD : xop3opimm<0xC2, "vprotd", X86vrotli, v4i32>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vrotli, v2i64>; + defm VPROTW : xop3opimm<0xC1, "vprotw", X86vrotli, v8i16>; } // Instruction where second source can be memory, but third must be register @@ -187,29 +178,29 @@ let ExeDomain = SSEPackedInt in { defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", - int_x86_xop_vpmadcswd, SchedWriteVecIMul.XMM>; + int_x86_xop_vpmadcswd, WriteVecIMul>; defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", - int_x86_xop_vpmadcsswd, SchedWriteVecIMul.XMM>; + int_x86_xop_vpmadcsswd, WriteVecIMul>; defm VPMACSWW : xop4opm2<0x95, "vpmacsww", - int_x86_xop_vpmacsww, SchedWriteVecIMul.XMM>; + int_x86_xop_vpmacsww, WriteVecIMul>; defm VPMACSWD : xop4opm2<0x96, "vpmacswd", - int_x86_xop_vpmacswd, SchedWriteVecIMul.XMM>; + int_x86_xop_vpmacswd, WriteVecIMul>; defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", - int_x86_xop_vpmacssww, SchedWriteVecIMul.XMM>; + int_x86_xop_vpmacssww, WriteVecIMul>; defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", - int_x86_xop_vpmacsswd, SchedWriteVecIMul.XMM>; + int_x86_xop_vpmacsswd, WriteVecIMul>; defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", - int_x86_xop_vpmacssdql, SchedWritePMULLD.XMM>; + int_x86_xop_vpmacssdql, WritePMULLD>; defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", - int_x86_xop_vpmacssdqh, SchedWritePMULLD.XMM>; + int_x86_xop_vpmacssdqh, WritePMULLD>; defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", - int_x86_xop_vpmacssdd, SchedWritePMULLD.XMM>; + int_x86_xop_vpmacssdd, WritePMULLD>; defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", - int_x86_xop_vpmacsdql, SchedWritePMULLD.XMM>; + int_x86_xop_vpmacsdql, WritePMULLD>; defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", - int_x86_xop_vpmacsdqh, SchedWritePMULLD.XMM>; + int_x86_xop_vpmacsdqh, WritePMULLD>; defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", - int_x86_xop_vpmacsdd, SchedWritePMULLD.XMM>; + int_x86_xop_vpmacsdd, WritePMULLD>; } // IFMA patterns - for cases where we can safely ignore the overflow bits from @@ -242,8 +233,7 @@ }]>; // Instruction where second source can be memory, third must be imm8 -multiclass xopvpcom opc, string Suffix, SDNode OpNode, ValueType vt128, - X86FoldableSchedWrite sched> { +multiclass xopvpcom opc, string Suffix, SDNode 
OpNode, ValueType vt128> { let ExeDomain = SSEPackedInt in { // SSE integer instructions let isCommutable = 1 in def ri : IXOPi8, - XOP_4V, Sched<[sched]>; + XOP_4V, Sched<[WriteVecALU]>; def mi : IXOPi8, - XOP_4V, Sched<[sched.Folded, ReadAfterLd]>; + XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>; let isAsmParserOnly = 1, hasSideEffects = 0 in { def ri_alt : IXOPi8, XOP_4V, Sched<[sched]>; + []>, XOP_4V, Sched<[WriteVecALU]>; let mayLoad = 1 in def mi_alt : IXOPi8, XOP_4V, Sched<[sched.Folded, ReadAfterLd]>; + []>, XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>; } } @@ -284,17 +274,17 @@ (CommuteVPCOMCC imm:$cc))>; } -defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8, SchedWriteVecALU.XMM>; -defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16, SchedWriteVecALU.XMM>; -defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32, SchedWriteVecALU.XMM>; -defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64, SchedWriteVecALU.XMM>; -defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8, SchedWriteVecALU.XMM>; -defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16, SchedWriteVecALU.XMM>; -defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32, SchedWriteVecALU.XMM>; -defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64, SchedWriteVecALU.XMM>; +defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8>; +defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16>; +defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32>; +defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64>; +defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8>; +defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16>; +defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32>; +defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64>; multiclass xop4op opc, string OpcodeStr, SDNode OpNode, - ValueType vt128, X86FoldableSchedWrite sched> { + ValueType vt128> { def rrr : IXOPi8Reg, - XOP_4V, Sched<[sched]>; + XOP_4V, Sched<[WriteVarShuffle]>; def rrm : IXOPi8Reg, - XOP_4V, VEX_W, Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>; + XOP_4V, VEX_W, Sched<[WriteVarShuffleLd, ReadAfterLd, ReadAfterLd]>; def rmr : IXOPi8Reg, - XOP_4V, Sched<[sched.Folded, ReadAfterLd, + XOP_4V, Sched<[WriteVarShuffleLd, ReadAfterLd, // 128mem:$src2 ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -330,39 +320,37 @@ (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V, VEX_W, Sched<[sched]>, FoldGenData; + []>, XOP_4V, VEX_W, Sched<[WriteVarShuffle]>, FoldGenData; } let ExeDomain = SSEPackedInt in { - defm VPPERM : xop4op<0xA3, "vpperm", X86vpperm, v16i8, - SchedWriteVarShuffle.XMM>; + defm VPPERM : xop4op<0xA3, "vpperm", X86vpperm, v16i8>; } // Instruction where either second or third source can be memory multiclass xop4op_int opc, string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop, ValueType VT, - X86FoldableSchedWrite sched> { + X86MemOperand x86memop, ValueType VT> { def rrr : IXOPi8Reg, XOP_4V, - Sched<[sched]>; + Sched<[WriteShuffle]>; def rrm : IXOPi8Reg, - XOP_4V, VEX_W, Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>; + XOP_4V, VEX_W, Sched<[WriteShuffleLd, ReadAfterLd, ReadAfterLd]>; def rmr : IXOPi8Reg, - XOP_4V, Sched<[sched.Folded, ReadAfterLd, + XOP_4V, Sched<[WriteShuffleLd, ReadAfterLd, // x86memop:$src2 ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -374,27 +362,25 @@ (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V, VEX_W, Sched<[sched]>, FoldGenData; + []>, XOP_4V, VEX_W, 
Sched<[WriteShuffle]>, FoldGenData; } let ExeDomain = SSEPackedInt in { - defm VPCMOV : xop4op_int<0xA2, "vpcmov", VR128, i128mem, v2i64, - SchedWriteShuffle.XMM>; - defm VPCMOVY : xop4op_int<0xA2, "vpcmov", VR256, i256mem, v4i64, - SchedWriteShuffle.YMM>, VEX_L; + defm VPCMOV : xop4op_int<0xA2, "vpcmov", VR128, i128mem, v2i64>; + defm VPCMOVY : xop4op_int<0xA2, "vpcmov", VR256, i256mem, v4i64>, VEX_L; } multiclass xop_vpermil2 Opc, string OpcodeStr, RegisterClass RC, X86MemOperand intmemop, X86MemOperand fpmemop, - ValueType VT, PatFrag FPLdFrag, PatFrag IntLdFrag, - X86FoldableSchedWrite sched> { + ValueType VT, PatFrag FPLdFrag, + PatFrag IntLdFrag> { def rr : IXOP5, - Sched<[sched]>; + Sched<[WriteFVarShuffle]>; def rm : IXOP5, VEX_W, - Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>; + Sched<[WriteFVarShuffleLd, ReadAfterLd, ReadAfterLd]>; def mr : IXOP5, - Sched<[sched.Folded, ReadAfterLd, + Sched<[WriteFVarShuffleLd, ReadAfterLd, // fpmemop:$src2 ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, // RC:$src3 @@ -422,24 +408,20 @@ (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), - []>, VEX_W, Sched<[sched]>, FoldGenData; + []>, VEX_W, Sched<[WriteFVarShuffle]>, FoldGenData; } let ExeDomain = SSEPackedDouble in { defm VPERMIL2PD : xop_vpermil2<0x49, "vpermil2pd", VR128, i128mem, f128mem, - v2f64, loadv2f64, loadv2i64, - SchedWriteFVarShuffle.XMM>; + v2f64, loadv2f64, loadv2i64>; defm VPERMIL2PDY : xop_vpermil2<0x49, "vpermil2pd", VR256, i256mem, f256mem, - v4f64, loadv4f64, loadv4i64, - SchedWriteFVarShuffle.YMM>, VEX_L; + v4f64, loadv4f64, loadv4i64>, VEX_L; } let ExeDomain = SSEPackedSingle in { defm VPERMIL2PS : xop_vpermil2<0x48, "vpermil2ps", VR128, i128mem, f128mem, - v4f32, loadv4f32, loadv2i64, - SchedWriteFVarShuffle.XMM>; + v4f32, loadv4f32, loadv2i64>; defm VPERMIL2PSY : xop_vpermil2<0x48, "vpermil2ps", VR256, i256mem, f256mem, - v8f32, loadv8f32, loadv4i64, - SchedWriteFVarShuffle.YMM>, VEX_L; + v8f32, loadv8f32, loadv4i64>, VEX_L; } Index: lib/Target/X86/X86SchedBroadwell.td =================================================================== --- lib/Target/X86/X86SchedBroadwell.td +++ lib/Target/X86/X86SchedBroadwell.td @@ -11,7 +11,6 @@ // scheduling and other instruction cost heuristics. // //===----------------------------------------------------------------------===// - def BroadwellModel : SchedMachineModel { // All x86 instructions are modeled as a single micro-op, and BW can decode 4 // instructions per cycle. @@ -156,12 +155,12 @@ def : WriteRes; defm : BWWriteResPair; // Floating point add/sub. -defm : BWWriteResPair; // Floating point add/sub (YMM/ZMM). +defm : BWWriteResPair; // Floating point add/sub (YMM/ZMM). defm : BWWriteResPair; // Floating point compare. -defm : BWWriteResPair; // Floating point compare (YMM/ZMM). +defm : BWWriteResPair; // Floating point compare (YMM/ZMM). defm : BWWriteResPair; // Floating point compare to flags. -defm : BWWriteResPair; // Floating point multiplication. -defm : BWWriteResPair; // Floating point multiplication (YMM/ZMM). +defm : BWWriteResPair; // Floating point multiplication. +defm : BWWriteResPair; // Floating point multiplication (YMM/ZMM). defm : BWWriteResPair; // 10-14 cycles. // Floating point division. defm : BWWriteResPair; // 10-14 cycles. // Floating point division (YMM/ZMM). defm : BWWriteResPair; // Floating point square root. @@ -173,9 +172,6 @@ defm : BWWriteResPair; // Fused Multiply Add. 
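// Illustrative sketch, not part of this patch: each of the defaults above is
// either a single WriteRes or a register/memory pair emitted by the model's
// local WriteResPair-style multiclass. Hand-expanded, one such pair looks
// roughly like the following; the ports, latencies and uop counts here are
// placeholders, not measured Broadwell values.
def : WriteRes<WriteFAdd, [BWPort1]> { let Latency = 3; }   // register form
def : WriteRes<WriteFAddLd, [BWPort1, BWPort23]> {          // folded-load form
  let Latency = 8;             // ALU latency plus an assumed 5-cycle load
  let NumMicroOps = 2;         // assumed: one load uop plus one ALU uop
  let ResourceCycles = [1, 1];
}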
defm : BWWriteResPair; // Fused Multiply Add (Scalar). defm : BWWriteResPair; // Fused Multiply Add (YMM/ZMM). -defm : BWWriteResPair; // Floating point double dot product. -defm : BWWriteResPair; // Floating point single dot product. -defm : BWWriteResPair; // Floating point single dot product (YMM). defm : BWWriteResPair; // Floating point fabs/fchs. defm : BWWriteResPair; // Floating point and/or/xor logicals. defm : BWWriteResPair; // Floating point and/or/xor logicals (YMM/ZMM). @@ -202,38 +198,19 @@ def : WriteRes; def : WriteRes; -defm : BWWriteResPair; // Vector integer ALU op, no logicals. -defm : BWWriteResPair; // Vector integer ALU op, no logicals (YMM/ZMM). +defm : BWWriteResPair; // Vector integer ALU op, no logicals. defm : BWWriteResPair; // Vector integer and/or/xor. defm : BWWriteResPair; // Vector integer and/or/xor (YMM/ZMM). -defm : BWWriteResPair; // Vector integer multiply. -defm : BWWriteResPair; // Vector integer multiply. -defm : BWWriteResPair; // Vector PMULLD. -defm : BWWriteResPair; // Vector PMULLD (YMM/ZMM). -defm : BWWriteResPair; // Vector shuffles. -defm : BWWriteResPair; // Vector shuffles (YMM/ZMM). -defm : BWWriteResPair; // Vector variable shuffles. -defm : BWWriteResPair; // Vector variable shuffles (YMM/ZMM). -defm : BWWriteResPair; // Vector blends. -defm : BWWriteResPair; // Vector blends (YMM/ZMM). +defm : BWWriteResPair; // Vector integer shifts. +defm : BWWriteResPair; // Vector integer multiply. +defm : BWWriteResPair; // PMULLD +defm : BWWriteResPair; // Vector shuffles. +defm : BWWriteResPair; // Vector variable shuffles. +defm : BWWriteResPair; // Vector blends. defm : BWWriteResPair; // Vector variable blends. -defm : BWWriteResPair; // Vector variable blends (YMM/ZMM). defm : BWWriteResPair; // Vector MPSAD. -defm : BWWriteResPair; // Vector MPSAD. -defm : BWWriteResPair; // Vector PSADBW. -defm : BWWriteResPair; // Vector PSADBW (YMM/ZMM). -defm : BWWriteResPair; // Vector PHMINPOS. - -// Vector integer shifts. -defm : BWWriteResPair; -defm : BWWriteResPair; -defm : X86WriteRes; -defm : X86WriteRes; - -defm : BWWriteResPair; // Vector integer immediate shifts (XMM). -defm : BWWriteResPair; // Vector integer immediate shifts (YMM/ZMM). -defm : BWWriteResPair; // Variable vector shifts. -defm : BWWriteResPair; // Variable vector shifts (YMM/ZMM). +defm : BWWriteResPair; // Vector PSADBW. +defm : BWWriteResPair; // Vector PHMINPOS. // Vector insert/extract operations. def : WriteRes { @@ -356,10 +333,11 @@ def : WriteRes { let Latency = 100; } // def WriteSystem : SchedWrite; // AVX2. -defm : BWWriteResPair; // Fp 256-bit width vector shuffles. -defm : BWWriteResPair; // Fp 256-bit width vector variable shuffles. -defm : BWWriteResPair; // 256-bit width vector shuffles. -defm : BWWriteResPair; // 256-bit width vector variable shuffles. +defm : BWWriteResPair; // Fp 256-bit width vector shuffles. +defm : BWWriteResPair; // Fp 256-bit width vector variable shuffles. +defm : BWWriteResPair; // 256-bit width vector shuffles. +defm : BWWriteResPair; // 256-bit width vector variable shuffles. +defm : BWWriteResPair; // Variable vector shifts. // Old microcoded instructions that nobody use. def : WriteRes { let Latency = 100; } // def WriteMicrocoded : SchedWrite; @@ -378,10 +356,9 @@ // Horizontal add/sub instructions. 
//////////////////////////////////////////////////////////////////////////////// -defm : BWWriteResPair; +defm : BWWriteResPair; defm : BWWriteResPair; -defm : BWWriteResPair; -defm : BWWriteResPair; +defm : BWWriteResPair; // Remaining instrs. @@ -557,6 +534,14 @@ def: InstRW<[BWWriteResGroup15], (instregex "VCVTPH2PS(Y?)rr", "(V?)CVTPS2PDrr", "(V?)CVTSS2SDrr", + "(V?)PSLLDrr", + "(V?)PSLLQrr", + "(V?)PSLLWrr", + "(V?)PSRADrr", + "(V?)PSRAWrr", + "(V?)PSRLDrr", + "(V?)PSRLQrr", + "(V?)PSRLWrr", "(V?)PTESTrr")>; def BWWriteResGroup16 : SchedWriteRes<[BWPort6,BWPort0156]> { @@ -671,6 +656,16 @@ "VPMOVZXWDYrr", "VPMOVZXWQYrr")>; +def BWWriteResGroup29 : SchedWriteRes<[BWPort01]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup29], (instregex "(V?)MULPD(Y?)rr", + "(V?)MULPS(Y?)rr", + "(V?)MULSDrr", + "(V?)MULSSrr")>; + def BWWriteResGroup30 : SchedWriteRes<[BWPort0156]> { let Latency = 2; let NumMicroOps = 3; @@ -680,6 +675,15 @@ XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr, XCHG16ar, XCHG32ar, XCHG64ar)>; +def BWWriteResGroup31 : SchedWriteRes<[BWPort0,BWPort5]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup31], (instregex "VPSLLVD(Y?)rr", + "VPSRAVD(Y?)rr", + "VPSRLVD(Y?)rr")>; + def BWWriteResGroup33 : SchedWriteRes<[BWPort5,BWPort0156]> { let Latency = 3; let NumMicroOps = 3; @@ -752,6 +756,14 @@ let ResourceCycles = [1,1]; } def: InstRW<[BWWriteResGroup40], (instregex "VCVTPS2PDYrr", + "VPSLLDYrr", + "VPSLLQYrr", + "VPSLLWYrr", + "VPSRADYrr", + "VPSRAWYrr", + "VPSRLDYrr", + "VPSRLQYrr", + "VPSRLWYrr", "VPTESTYrr")>; def BWWriteResGroup41 : SchedWriteRes<[BWPort0,BWPort0156]> { @@ -823,10 +835,7 @@ let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[BWWriteResGroup47], (instregex "(V?)PCMPGTQ(Y?)rr", - "MUL_FPrST0", - "MUL_FST0r", - "MUL_FrST0")>; +def: InstRW<[BWWriteResGroup47], (instregex "(V?)PCMPGTQ(Y?)rr")>; def BWWriteResGroup49 : SchedWriteRes<[BWPort23]> { let Latency = 5; @@ -1053,8 +1062,16 @@ let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[BWWriteResGroup73], (instregex "VPSLLVQYrm", +def: InstRW<[BWWriteResGroup73], (instregex "VPSLLDYrm", + "VPSLLQYrm", + "VPSLLVQYrm", + "VPSLLWYrm", + "VPSRADYrm", + "VPSRAWYrm", + "VPSRLDYrm", + "VPSRLQYrm", "VPSRLVQYrm", + "VPSRLWYrm", "VTESTPDYrm", "VTESTPSYrm")>; @@ -1068,6 +1085,79 @@ "FCOMP32m", "FCOMP64m")>; +def BWWriteResGroup75 : SchedWriteRes<[BWPort5,BWPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup75], (instregex "VPACKSSDWYrm", + "VPACKSSWBYrm", + "VPACKUSDWYrm", + "VPACKUSWBYrm", + "VPALIGNRYrmi", + "VPBLENDWYrmi", + "VPSHUFBYrm", + "VPSHUFDYmi", + "VPSHUFHWYmi", + "VPSHUFLWYmi", + "VPUNPCKHBWYrm", + "VPUNPCKHDQYrm", + "VPUNPCKHQDQYrm", + "VPUNPCKHWDYrm", + "VPUNPCKLBWYrm", + "VPUNPCKLDQYrm", + "VPUNPCKLQDQYrm", + "VPUNPCKLWDYrm")>; + +def BWWriteResGroup76 : SchedWriteRes<[BWPort23,BWPort15]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup76], (instregex "VPABSBYrm", + "VPABSDYrm", + "VPABSWYrm", + "VPADDBYrm", + "VPADDDYrm", + "VPADDQYrm", + "VPADDSBYrm", + "VPADDSWYrm", + "VPADDUSBYrm", + "VPADDUSWYrm", + "VPADDWYrm", + "VPAVGBYrm", + "VPAVGWYrm", + "VPCMPEQBYrm", + "VPCMPEQDYrm", + "VPCMPEQQYrm", + "VPCMPEQWYrm", + "VPCMPGTBYrm", + "VPCMPGTDYrm", + "VPCMPGTWYrm", + "VPMAXSBYrm", + "VPMAXSDYrm", + "VPMAXSWYrm", + "VPMAXUBYrm", + "VPMAXUDYrm", + "VPMAXUWYrm", + 
"VPMINSBYrm", + "VPMINSDYrm", + "VPMINSWYrm", + "VPMINUBYrm", + "VPMINUDYrm", + "VPMINUWYrm", + "VPSIGNBYrm", + "VPSIGNDYrm", + "VPSIGNWYrm", + "VPSUBBYrm", + "VPSUBDYrm", + "VPSUBQYrm", + "VPSUBSBYrm", + "VPSUBSWYrm", + "VPSUBUSBYrm", + "VPSUBUSWYrm", + "VPSUBWYrm")>; + def BWWriteResGroup77 : SchedWriteRes<[BWPort23,BWPort015]> { let Latency = 7; let NumMicroOps = 2; @@ -1101,7 +1191,15 @@ let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } -def: InstRW<[BWWriteResGroup81], (instregex "(V?)PTESTrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "(V?)PSLLDrm", + "(V?)PSLLQrm", + "(V?)PSLLWrm", + "(V?)PSRADrm", + "(V?)PSRAWrm", + "(V?)PSRLDrm", + "(V?)PSRLQrm", + "(V?)PSRLWrm", + "(V?)PTESTrm")>; def BWWriteResGroup82 : SchedWriteRes<[BWPort0,BWPort01,BWPort23]> { let Latency = 7; @@ -1194,6 +1292,16 @@ "VPMOVSXWQYrm", "VPMOVZXWDYrm")>; +def BWWriteResGroup93 : SchedWriteRes<[BWPort01,BWPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup93], (instregex "(V?)MULPDrm", + "(V?)MULPSrm", + "(V?)MULSDrm", + "(V?)MULSSrm")>; + def BWWriteResGroup94 : SchedWriteRes<[BWPort5,BWPort23]> { let Latency = 8; let NumMicroOps = 3; @@ -1201,9 +1309,19 @@ } def: InstRW<[BWWriteResGroup94], (instregex "VMASKMOVPDYrm", "VMASKMOVPSYrm", + "VPBLENDVBYrm", "VPMASKMOVDYrm", "VPMASKMOVQYrm")>; +def BWWriteResGroup95 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[BWWriteResGroup95], (instregex "VPSLLVDrm", + "VPSRAVDrm", + "VPSRLVDrm")>; + def BWWriteResGroup97 : SchedWriteRes<[BWPort23,BWPort237,BWPort06,BWPort0156]> { let Latency = 8; let NumMicroOps = 5; @@ -1250,8 +1368,20 @@ } def: InstRW<[BWWriteResGroup101], (instregex "(ADD|SUB|SUBR)_F(32|64)m", "ILD_F(16|32|64)m", + "VADDPDYrm", + "VADDPSYrm", + "VADDSUBPDYrm", + "VADDSUBPSYrm", + "VCMPPDYrmi", + "VCMPPSYrmi", "VCVTPS2DQYrm", - "VCVTTPS2DQYrm")>; + "VCVTTPS2DQYrm", + "VMAX(C?)PDYrm", + "VMAX(C?)PSYrm", + "VMIN(C?)PDYrm", + "VMIN(C?)PSYrm", + "VSUBPDYrm", + "VSUBPSYrm")>; def BWWriteResGroup102 : SchedWriteRes<[BWPort5,BWPort23]> { let Latency = 9; @@ -1270,6 +1400,21 @@ "VPMOVZXDQYrm", "VPMOVZXWQYrm")>; +def BWWriteResGroup103 : SchedWriteRes<[BWPort01,BWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup103], (instregex "VMULPDYrm", + "VMULPSYrm")>; + +def BWWriteResGroup104 : SchedWriteRes<[BWPort0,BWPort1,BWPort5]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup104], (instregex "(V?)DPPDrri")>; + def BWWriteResGroup105 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> { let Latency = 9; let NumMicroOps = 3; @@ -1314,6 +1459,27 @@ def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTB(Y?)rm", "VPBROADCASTW(Y?)rm")>; +def BWWriteResGroup109 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[BWWriteResGroup109], (instregex "VPSLLVDYrm", + "VPSRAVDYrm", + "VPSRLVDYrm")>; + +def BWWriteResGroup110 : SchedWriteRes<[BWPort5,BWPort23,BWPort15]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[BWWriteResGroup110], (instregex "VPHADDDYrm", + "VPHADDSWYrm", + "VPHADDWYrm", + "VPHSUBDYrm", + "VPHSUBSWYrm", + "VPHSUBWYrm")>; + def BWWriteResGroup111 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort0156]> { let Latency = 9; let NumMicroOps = 4; @@ -1395,7 
+1561,16 @@ let ResourceCycles = [1,1]; } def: InstRW<[BWWriteResGroup123], (instregex "MUL_F(32|64)m", - "VPCMPGTQYrm")>; + "VPCMPGTQYrm", + "VPMADDUBSWYrm", + "VPMADDWDYrm", + "VPMULDQYrm", + "VPMULHRSWYrm", + "VPMULHUWYrm", + "VPMULHWYrm", + "VPMULLWYrm", + "VPMULUDQYrm", + "VPSADBWYrm")>; def BWWriteResGroup126 : SchedWriteRes<[BWPort0,BWPort015]> { let Latency = 11; @@ -1476,6 +1651,13 @@ } def: InstRW<[BWWriteResGroup137_1], (instregex "(V?)SQRTSSr")>; +def BWWriteResGroup138 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[BWWriteResGroup138], (instregex "VMPSADBWYrmi")>; + def BWWriteResGroup139 : SchedWriteRes<[BWPort0,BWFPDivider]> { let Latency = 14; let NumMicroOps = 1; @@ -1497,6 +1679,20 @@ } def: InstRW<[BWWriteResGroup141], (instregex "MUL_FI(16|32)m")>; +def BWWriteResGroup142 : SchedWriteRes<[BWPort0,BWPort1,BWPort5]> { + let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[BWWriteResGroup142], (instregex "(V?)DPPS(Y?)rri")>; + +def BWWriteResGroup143 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> { + let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup143], (instregex "(V?)DPPDrmi")>; + def BWWriteResGroup144 : SchedWriteRes<[BWPort1,BWPort6,BWPort23,BWPort0156]> { let Latency = 14; let NumMicroOps = 8; @@ -1542,6 +1738,13 @@ def: InstRW<[BWWriteResGroup150], (instregex "(V?)DIVPSrm", "(V?)DIVSSrm")>; +def BWWriteResGroup151 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 16; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup151], (instregex "VPMULLDYrm")>; + def BWWriteResGroup153 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> { let Latency = 16; let NumMicroOps = 14; @@ -1602,6 +1805,13 @@ def: InstRW<[BWWriteResGroup161], (instregex "(V?)DIVPDrm", "(V?)DIVSDrm")>; +def BWWriteResGroup163 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> { + let Latency = 19; + let NumMicroOps = 5; + let ResourceCycles = [2,1,1,1]; +} +def: InstRW<[BWWriteResGroup163], (instregex "(V?)DPPSrmi")>; + def BWWriteResGroup165 : SchedWriteRes<[BWPort0]> { let Latency = 20; let NumMicroOps = 1; @@ -1611,6 +1821,13 @@ "DIV_FST0r", "DIV_FrST0")>; +def BWWriteResGroup166 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> { + let Latency = 20; + let NumMicroOps = 5; + let ResourceCycles = [2,1,1,1]; +} +def: InstRW<[BWWriteResGroup166], (instregex "VDPPSYrmi")>; + def BWWriteResGroup167 : SchedWriteRes<[BWPort4,BWPort5,BWPort6,BWPort23,BWPort237,BWPort06,BWPort0156]> { let Latency = 20; let NumMicroOps = 8; Index: lib/Target/X86/X86SchedHaswell.td =================================================================== --- lib/Target/X86/X86SchedHaswell.td +++ lib/Target/X86/X86SchedHaswell.td @@ -153,8 +153,8 @@ defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; // 10-14 cycles. defm : HWWriteResPair; // 10-14 cycles. 
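// Illustrative sketch, not part of this patch: where an instruction's timing
// differs from the class defaults above, the model overrides it further down
// with a SchedWriteRes group bound through InstRW, in the same shape as the
// HWWriteResGroup* entries in this file. The group name, the numbers and the
// opcode regex below are placeholders; a real binding must name existing
// instruction definitions, and "(V?)" is how these files match both the SSE
// and the VEX-encoded form with one pattern.
def HWWriteResGroupSketch : SchedWriteRes<[HWPort0, HWPort23]> {
  let Latency = 7;             // end-to-end latency, including the load
  let NumMicroOps = 2;         // one load uop plus one compute uop
  let ResourceCycles = [1, 1]; // one cycle on HWPort0, one on HWPort23
}
def: InstRW<[HWWriteResGroupSketch], (instregex "(V?)SOMEOPrm")>;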
defm : HWWriteResPair; @@ -169,20 +169,17 @@ defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; @@ -197,41 +194,23 @@ def : WriteRes { let Latency = 5; } def : WriteRes; +defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; -// Vector integer shifts. -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : X86WriteRes; -defm : X86WriteRes; - -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; - // Vector insert/extract operations. def : WriteRes { let Latency = 2; @@ -242,7 +221,6 @@ let Latency = 6; let NumMicroOps = 2; } -def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>; def : WriteRes { let Latency = 2; @@ -621,8 +599,7 @@ defm : HWWriteResPair; defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; //=== Floating Point XMM and YMM Instructions ===// @@ -846,8 +823,16 @@ let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLVQYrm", +def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLDYrm", + "VPSLLQYrm", + "VPSLLVQYrm", + "VPSLLWYrm", + "VPSRADYrm", + "VPSRAWYrm", + "VPSRLDYrm", + "VPSRLQYrm", "VPSRLVQYrm", + "VPSRLWYrm", "VTESTPDYrm", "VTESTPSYrm")>; @@ -889,11 +874,15 @@ let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup13], (instregex "(V?)PACKSSDWrm", +def: InstRW<[HWWriteResGroup13], (instregex "(V?)INSERTPSrm", + "(V?)PACKSSDWrm", "(V?)PACKSSWBrm", "(V?)PACKUSDWrm", "(V?)PACKUSWBrm", "(V?)PALIGNRrmi", + "VPERMILPDmi", + "VPERMILPSmi", + "(V?)PSHUFBrm", "(V?)PSHUFDmi", "(V?)PSHUFHWmi", "(V?)PSHUFLWmi", @@ -904,23 +893,40 @@ "(V?)PUNPCKLBWrm", "(V?)PUNPCKLDQrm", "(V?)PUNPCKLQDQrm", - "(V?)PUNPCKLWDrm")>; + "(V?)PUNPCKLWDrm", + "(V?)SHUFPDrmi", + "(V?)SHUFPSrmi", + "(V?)UNPCKHPDrm", + "(V?)UNPCKHPSrm", + "(V?)UNPCKLPDrm", + "(V?)UNPCKLPSrm")>; def HWWriteResGroup13_1 : SchedWriteRes<[HWPort5,HWPort23]> { let Latency = 8; let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXBDYrm", +def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKSSDWYrm", + "VPACKSSWBYrm", + "VPACKUSDWYrm", + "VPACKUSWBYrm", + "VPALIGNRYrmi", + "VPBLENDWYrmi", + "VPMOVSXBDYrm", "VPMOVSXBQYrm", - "VPMOVSXWQYrm")>; - -def HWWriteResGroup13_2 : SchedWriteRes<[HWPort5,HWPort23]> { - let Latency = 6; - let NumMicroOps = 2; - let 
ResourceCycles = [1,1]; -} -def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PSHUFBrm")>; + "VPMOVSXWQYrm", + "VPSHUFBYrm", + "VPSHUFDYmi", + "VPSHUFHWYmi", + "VPSHUFLWYmi", + "VPUNPCKHBWYrm", + "VPUNPCKHDQYrm", + "VPUNPCKHQDQYrm", + "VPUNPCKHWDYrm", + "VPUNPCKLBWYrm", + "VPUNPCKLDQYrm", + "VPUNPCKLQDQYrm", + "VPUNPCKLWDYrm")>; def HWWriteResGroup14 : SchedWriteRes<[HWPort6,HWPort23]> { let Latency = 6; @@ -946,14 +952,105 @@ "BLSI(32|64)rm", "BLSMSK(32|64)rm", "BLSR(32|64)rm", - "MOVBE(16|32|64)rm", - "MMX_PABS(B|D|W)rm", - "MMX_P(ADD|SUB)(B|D|W|Q)irm", - "MMX_P(ADD|SUB)(U?)S(B|W)irm", - "MMX_PAVG(B|W)irm", - "MMX_PCMP(EQ|GT)(B|D|W)irm", - "MMX_P(MAX|MIN)(SW|UB)irm", - "MMX_PSIGN(B|D|W)rm")>; + "MOVBE(16|32|64)rm")>; + +def HWWriteResGroup16_1 : SchedWriteRes<[HWPort23,HWPort15]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup16_1], (instregex "(V?)PABSBrm", + "(V?)PABSDrm", + "(V?)PABSWrm", + "(V?)PADDBrm", + "(V?)PADDDrm", + "(V?)PADDQrm", + "(V?)PADDSBrm", + "(V?)PADDSWrm", + "(V?)PADDUSBrm", + "(V?)PADDUSWrm", + "(V?)PADDWrm", + "(V?)PAVGBrm", + "(V?)PAVGWrm", + "(V?)PCMPEQBrm", + "(V?)PCMPEQDrm", + "(V?)PCMPEQQrm", + "(V?)PCMPEQWrm", + "(V?)PCMPGTBrm", + "(V?)PCMPGTDrm", + "(V?)PCMPGTWrm", + "(V?)PMAXSBrm", + "(V?)PMAXSDrm", + "(V?)PMAXSWrm", + "(V?)PMAXUBrm", + "(V?)PMAXUDrm", + "(V?)PMAXUWrm", + "(V?)PMINSBrm", + "(V?)PMINSDrm", + "(V?)PMINSWrm", + "(V?)PMINUBrm", + "(V?)PMINUDrm", + "(V?)PMINUWrm", + "(V?)PSIGNBrm", + "(V?)PSIGNDrm", + "(V?)PSIGNWrm", + "(V?)PSUBBrm", + "(V?)PSUBDrm", + "(V?)PSUBQrm", + "(V?)PSUBSBrm", + "(V?)PSUBSWrm", + "(V?)PSUBUSBrm", + "(V?)PSUBUSWrm", + "(V?)PSUBWrm")>; + +def HWWriteResGroup16_2 : SchedWriteRes<[HWPort23,HWPort15]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup16_2], (instregex "VPABSBYrm", + "VPABSDYrm", + "VPABSWYrm", + "VPADDBYrm", + "VPADDDYrm", + "VPADDQYrm", + "VPADDSBYrm", + "VPADDSWYrm", + "VPADDUSBYrm", + "VPADDUSWYrm", + "VPADDWYrm", + "VPAVGBYrm", + "VPAVGWYrm", + "VPCMPEQBYrm", + "VPCMPEQDYrm", + "VPCMPEQQYrm", + "VPCMPEQWYrm", + "VPCMPGTBYrm", + "VPCMPGTDYrm", + "VPCMPGTWYrm", + "VPMAXSBYrm", + "VPMAXSDYrm", + "VPMAXSWYrm", + "VPMAXUBYrm", + "VPMAXUDYrm", + "VPMAXUWYrm", + "VPMINSBYrm", + "VPMINSDYrm", + "VPMINSWYrm", + "VPMINUBYrm", + "VPMINUDYrm", + "VPMINUWYrm", + "VPSIGNBYrm", + "VPSIGNDYrm", + "VPSIGNWYrm", + "VPSUBBYrm", + "VPSUBDYrm", + "VPSUBQYrm", + "VPSUBSBYrm", + "VPSUBSWYrm", + "VPSUBUSBYrm", + "VPSUBUSWYrm", + "VPSUBWYrm")>; def HWWriteResGroup17 : SchedWriteRes<[HWPort23,HWPort015]> { let Latency = 7; @@ -1086,6 +1183,14 @@ "VCVTPH2PSrr", "(V?)CVTPS2PDrr", "(V?)CVTSS2SDrr", + "(V?)PSLLDrr", + "(V?)PSLLQrr", + "(V?)PSLLWrr", + "(V?)PSRADrr", + "(V?)PSRAWrr", + "(V?)PSRLDrr", + "(V?)PSRLQrr", + "(V?)PSRLWrr", "(V?)PTESTrr")>; def HWWriteResGroup32 : SchedWriteRes<[HWPort6,HWPort0156]> { @@ -1147,6 +1252,7 @@ } def: InstRW<[HWWriteResGroup36_1], (instregex "VMASKMOVPDYrm", "VMASKMOVPSYrm", + "VPBLENDVBYrm", "VPMASKMOVDYrm", "VPMASKMOVQYrm")>; @@ -1172,7 +1278,15 @@ let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } -def: InstRW<[HWWriteResGroup38], (instregex "(V?)PTESTrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "(V?)PSLLDrm", + "(V?)PSLLQrm", + "(V?)PSLLWrm", + "(V?)PSRADrm", + "(V?)PSRAWrm", + "(V?)PSRLDrm", + "(V?)PSRLQrm", + "(V?)PSRLWrm", + "(V?)PTESTrm")>; def HWWriteResGroup39 : SchedWriteRes<[HWPort0,HWPort01,HWPort23]> { let Latency = 7; @@ -1301,7 +1415,13 @@ let NumMicroOps = 
2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBDYrm", +def: InstRW<[HWWriteResGroup53], (instregex "VPERM2F128rm", + "VPERM2I128rm", + "VPERMDYrm", + "VPERMPDYmi", + "VPERMPSYrm", + "VPERMQYmi", + "VPMOVZXBDYrm", "VPMOVZXBQYrm", "VPMOVZXBWYrm", "VPMOVZXDQYrm", @@ -1326,6 +1446,15 @@ XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr, XCHG16ar, XCHG32ar, XCHG64ar)>; +def HWWriteResGroup55 : SchedWriteRes<[HWPort0,HWPort5]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup55], (instregex "VPSLLVD(Y?)rr", + "VPSRAVD(Y?)rr", + "VPSRLVD(Y?)rr")>; + def HWWriteResGroup57 : SchedWriteRes<[HWPort5,HWPort0156]> { let Latency = 3; let NumMicroOps = 3; @@ -1379,6 +1508,24 @@ def: InstRW<[HWWriteResGroup62], (instregex "IST(T?)_FP(16|32|64)m", "IST_F(16|32)m")>; +def HWWriteResGroup63 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[HWWriteResGroup63], (instregex "VPSLLVDYrm", + "VPSRAVDYrm", + "VPSRLVDYrm")>; + +def HWWriteResGroup63_1 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[HWWriteResGroup63_1], (instregex "VPSLLVDrm", + "VPSRAVDrm", + "VPSRLVDrm")>; + def HWWriteResGroup64 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> { let Latency = 8; let NumMicroOps = 4; @@ -1386,6 +1533,18 @@ } def: InstRW<[HWWriteResGroup64], (instregex "MMX_PH(ADD|SUB)(D|SW|W)rm")>; +def HWWriteResGroup64_1 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[HWWriteResGroup64_1], (instregex "VPHADDDYrm", + "VPHADDSWYrm", + "VPHADDWYrm", + "VPHSUBDYrm", + "VPHSUBSWYrm", + "VPHSUBWYrm")>; + def HWWriteResGroup65 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> { let Latency = 8; let NumMicroOps = 4; @@ -1452,6 +1611,14 @@ let ResourceCycles = [1,1]; } def: InstRW<[HWWriteResGroup71], (instregex "VCVTPS2PDYrr", + "VPSLLDYrr", + "VPSLLQYrr", + "VPSLLWYrr", + "VPSRADYrr", + "VPSRAWYrr", + "VPSRLDYrr", + "VPSRLQYrr", + "VPSRLWYrr", "VPTESTYrr")>; def HWWriteResGroup72 : SchedWriteRes<[HWPort0,HWPort0156]> { @@ -1630,10 +1797,17 @@ let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[HWWriteResGroup89], (instregex "(V?)PCMPGTQ(Y?)rr", - "MUL_FPrST0", - "MUL_FST0r", - "MUL_FrST0")>; +def: InstRW<[HWWriteResGroup89], (instregex "(V?)PCMPGTQ(Y?)rr")>; + +def HWWriteResGroup90 : SchedWriteRes<[HWPort01]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup90], (instregex "(V?)MULPD(Y?)rr", + "(V?)MULPS(Y?)rr", + "(V?)MULSDrr", + "(V?)MULSSrr")>; def HWWriteResGroup91_1 : SchedWriteRes<[HWPort0,HWPort23,HWFPDivider]> { let Latency = 16; @@ -1655,6 +1829,15 @@ let ResourceCycles = [1,1]; } def: InstRW<[HWWriteResGroup91_2], (instregex "(V?)PCMPGTQrm", + "(V?)PMADDUBSWrm", + "(V?)PMADDWDrm", + "(V?)PMULDQrm", + "(V?)PMULHRSWrm", + "(V?)PMULHUWrm", + "(V?)PMULHWrm", + "(V?)PMULLWrm", + "(V?)PMULUDQrm", + "(V?)PSADBWrm", "(V?)RCPPSm", "(V?)RSQRTPSm")>; @@ -1664,21 +1847,32 @@ let ResourceCycles = [1,1]; } def: InstRW<[HWWriteResGroup91_3], (instregex "MUL_F(32|64)m", - "VPCMPGTQYrm")>; + "VPCMPGTQYrm", + "VPMADDUBSWYrm", + "VPMADDWDYrm", + "VPMULDQYrm", + "VPMULHRSWYrm", + "VPMULHUWYrm", + "VPMULHWYrm", + "VPMULLWYrm", + "VPMULUDQYrm", + "VPSADBWYrm")>; + +def HWWriteResGroup92 : SchedWriteRes<[HWPort01,HWPort23]> { + let Latency = 
11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup92], (instregex "(V?)MULPDrm", + "(V?)MULPSrm")>; -def HWWriteResGroup91_5 : SchedWriteRes<[HWPort0,HWPort23]> { - let Latency = 10; +def HWWriteResGroup92_1 : SchedWriteRes<[HWPort01,HWPort23]> { + let Latency = 12; let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup91_5], (instregex "MMX_PMADDUBSWrm", - "MMX_PMADDWDirm", - "MMX_PMULHRSWrm", - "MMX_PMULHUWirm", - "MMX_PMULHWirm", - "MMX_PMULLWirm", - "MMX_PMULUDQirm", - "MMX_PSADBWirm")>; +def: InstRW<[HWWriteResGroup92_1], (instregex "VMULPDYrm", + "VMULPSYrm")>; def HWWriteResGroup92_2 : SchedWriteRes<[HWPort01,HWPort23]> { let Latency = 10; @@ -1818,6 +2012,13 @@ def: InstRW<[HWWriteResGroup109], (instregex "SHLD(16|32|64)mrCL", "SHRD(16|32|64)mrCL")>; +def HWWriteResGroup113_1 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { + let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[HWWriteResGroup113_1], (instregex "VMPSADBWYrmi")>; + def HWWriteResGroup114 : SchedWriteRes<[HWPort6,HWPort06,HWPort15,HWPort0156]> { let Latency = 7; let NumMicroOps = 7; @@ -1832,6 +2033,27 @@ } def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI(16|32)m")>; +def HWWriteResGroup116 : SchedWriteRes<[HWPort0,HWPort1,HWPort5]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup116], (instregex "(V?)DPPDrri")>; + +def HWWriteResGroup117 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> { + let Latency = 15; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup117], (instregex "(V?)DPPDrmi")>; + +def HWWriteResGroup119_1 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 17; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup119_1], (instregex "VPMULLDYrm")>; + def HWWriteResGroup120 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> { let Latency = 16; let NumMicroOps = 10; @@ -1921,6 +2143,27 @@ } def: InstRW<[HWWriteResGroup138], (instregex "(V?)SQRTPSm")>; +def HWWriteResGroup140 : SchedWriteRes<[HWPort0,HWPort1,HWPort5]> { + let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[HWWriteResGroup140], (instregex "(V?)DPPS(Y?)rri")>; + +def HWWriteResGroup141 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> { + let Latency = 20; + let NumMicroOps = 5; + let ResourceCycles = [2,1,1,1]; +} +def: InstRW<[HWWriteResGroup141], (instregex "(V?)DPPSrmi")>; + +def HWWriteResGroup141_1 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> { + let Latency = 21; + let NumMicroOps = 5; + let ResourceCycles = [2,1,1,1]; +} +def: InstRW<[HWWriteResGroup141_1], (instregex "VDPPSYrmi")>; + def HWWriteResGroup142 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> { let Latency = 14; let NumMicroOps = 10; Index: lib/Target/X86/X86SchedSandyBridge.td =================================================================== --- lib/Target/X86/X86SchedSandyBridge.td +++ lib/Target/X86/X86SchedSandyBridge.td @@ -153,19 +153,16 @@ defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; -defm : SBWriteResPair; +defm : SBWriteResPair; defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; +defm : SBWriteResPair; 
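// Illustrative sketch, not part of this patch: in the SBWriteResGroup*
// entries below, ResourceCycles lines up positionally with the port list,
// while NumMicroOps is counted separately. The placeholder entry here reads
// as "three uops in total, two cycles on SBPort0 and one on SBPort23".
def SBWriteResGroupSketch : SchedWriteRes<[SBPort0, SBPort23]> {
  let Latency = 16;            // placeholder end-to-end latency
  let NumMicroOps = 3;
  let ResourceCycles = [2, 1]; // 2 cycles of SBPort0, 1 cycle of SBPort23
}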
+defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; @@ -177,37 +174,20 @@ def : WriteRes { let Latency = 6; } def : WriteRes; +defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; // TODO this is probably wrong for 256/512-bit for the "generic" model -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; // TODO this is probably wrong for 256/512-bit for the "generic" model +defm : SBWriteResPair; +defm : SBWriteResPair; defm : SBWriteResPair; -defm : SBWriteResPair; defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; defm : SBWriteResPair; -// Vector integer shifts. -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; - // Vector insert/extract operations. def : WriteRes { let Latency = 2; @@ -234,7 +214,6 @@ defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; -defm : SBWriteResPair; //////////////////////////////////////////////////////////////////////////////// // String instructions. @@ -343,10 +322,11 @@ // AVX2/FMA is not supported on that architecture, but we should define the basic // scheduling resources anyway. -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; @@ -359,6 +339,14 @@ let ResourceCycles = [1]; } def: InstRW<[SBWriteResGroup0], (instregex "(V?)CVTSS2SDrr", + "(V?)PSLLDri", + "(V?)PSLLQri", + "(V?)PSLLWri", + "(V?)PSRADri", + "(V?)PSRAWri", + "(V?)PSRLDri", + "(V?)PSRLQri", + "(V?)PSRLWri", "VTESTPD(Y?)rr", "VTESTPS(Y?)rr")>; @@ -384,6 +372,9 @@ "RETQ", "ST_FPrr", "ST_Frr", + "VEXTRACTF128rr", + "VINSERTF128rr", + "VPERM2F128rr", "(V?)MOV64toPQIrr", "(V?)MOVDI2PDIrr")>; @@ -410,12 +401,47 @@ let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[SBWriteResGroup5], (instregex "MMX_PALIGNRrri", +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABS(B|D|W)rr", + "MMX_PADDQirr", + "MMX_PALIGNRrri", + "MMX_PSIGN(B|D|W)rr", + "(V?)PABSBrr", + "(V?)PABSDrr", + "(V?)PABSWrr", "(V?)PACKSSDWrr", "(V?)PACKSSWBrr", "(V?)PACKUSDWrr", "(V?)PACKUSWBrr", + "(V?)PADDBrr", + "(V?)PADDDrr", + "(V?)PADDQrr", + "(V?)PADDSBrr", + "(V?)PADDSWrr", + "(V?)PADDUSBrr", + "(V?)PADDUSWrr", + "(V?)PADDWrr", "(V?)PALIGNRrri", + "(V?)PAVGBrr", + "(V?)PAVGWrr", + "(V?)PCMPEQBrr", + "(V?)PCMPEQDrr", + "(V?)PCMPEQQrr", + "(V?)PCMPEQWrr", + "(V?)PCMPGTBrr", + "(V?)PCMPGTDrr", + "(V?)PCMPGTWrr", + "(V?)PMAXSBrr", + "(V?)PMAXSDrr", + "(V?)PMAXSWrr", + "(V?)PMAXUBrr", + "(V?)PMAXUDrr", + "(V?)PMAXUWrr", + "(V?)PMINSBrr", + "(V?)PMINSDrr", + "(V?)PMINSWrr", + "(V?)PMINUBrr", + "(V?)PMINUDrr", + "(V?)PMINUWrr", "(V?)PMOVSXBDrr", "(V?)PMOVSXBQrr", "(V?)PMOVSXBWrr", @@ -431,8 +457,19 @@ "(V?)PSHUFDri", "(V?)PSHUFHWri", "(V?)PSHUFLWri", + "(V?)PSIGNBrr", + "(V?)PSIGNDrr", + "(V?)PSIGNWrr", "(V?)PSLLDQri", "(V?)PSRLDQri", + "(V?)PSUBBrr", + "(V?)PSUBDrr", + "(V?)PSUBQrr", + "(V?)PSUBSBrr", + 
"(V?)PSUBSWrr", + "(V?)PSUBUSBrr", + "(V?)PSUBUSWrr", + "(V?)PSUBWrr", "(V?)PUNPCKHBWrr", "(V?)PUNPCKHDQrr", "(V?)PUNPCKHQDQrr", @@ -498,6 +535,20 @@ def: InstRW<[SBWriteResGroup13], (instregex "(V?)CVTPS2PD(Y?)rr", "(V?)PTEST(Y?)rr")>; +def SBWriteResGroup14 : SchedWriteRes<[SBPort0,SBPort15]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup14], (instregex "(V?)PSLLDrr", + "(V?)PSLLQrr", + "(V?)PSLLWrr", + "(V?)PSRADrr", + "(V?)PSRAWrr", + "(V?)PSRLDrr", + "(V?)PSRLQrr", + "(V?)PSRLWrr")>; + def SBWriteResGroup15 : SchedWriteRes<[SBPort0,SBPort015]> { let Latency = 2; let NumMicroOps = 2; @@ -548,12 +599,6 @@ let ResourceCycles = [1]; } def: InstRW<[SBWriteResGroup21], (instregex "MMX_CVTPI2PSirr", - "MMX_PADD(B|D|W)irr", - "MMX_P(ADD|SUB)(U?)S(B|W)irr", - "MMX_PAVG(B|W)irr", - "MMX_PCMP(EQ|GT)(B|D|W)irr", - "MMX_P(MAX|MIN)(SW|UB)irr", - "MMX_PSUB(B|D|Q|W)irr", "PUSHFS64", "(V?)CVTDQ2PS(Y?)rr")>; @@ -842,7 +887,6 @@ } def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABS(B|D|W)rm", "MMX_PALIGNRrmi", - "MMX_PSHUFBrm", "MMX_PSIGN(B|D|W)rm")>; def SBWriteResGroup52 : SchedWriteRes<[SBPort23,SBPort015]> { @@ -892,6 +936,28 @@ "VTESTPDrm", "VTESTPSrm")>; +def SBWriteResGroup56 : SchedWriteRes<[SBPort5,SBPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup56], (instregex "VBROADCASTF128", + "(V?)INSERTPSrm", + "(V?)MOVHPDrm", + "(V?)MOVHPSrm", + "(V?)MOVLPDrm", + "(V?)MOVLPSrm", + "VPERMILPDmi", + "VPERMILPDrm", + "VPERMILPSmi", + "VPERMILPSrm", + "(V?)SHUFPDrmi", + "(V?)SHUFPSrmi", + "(V?)UNPCKHPDrm", + "(V?)UNPCKHPSrm", + "(V?)UNPCKLPDrm", + "(V?)UNPCKLPSrm")>; + def SBWriteResGroup58 : SchedWriteRes<[SBPort23,SBPort05]> { let Latency = 7; let NumMicroOps = 2; @@ -904,11 +970,44 @@ let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SBWriteResGroup59], (instregex "(V?)PACKSSDWrm", +def: InstRW<[SBWriteResGroup59], (instregex "MMX_PADDQirm", + "(V?)PABSBrm", + "(V?)PABSDrm", + "(V?)PABSWrm", + "(V?)PACKSSDWrm", "(V?)PACKSSWBrm", "(V?)PACKUSDWrm", "(V?)PACKUSWBrm", + "(V?)PADDBrm", + "(V?)PADDDrm", + "(V?)PADDQrm", + "(V?)PADDSBrm", + "(V?)PADDSWrm", + "(V?)PADDUSBrm", + "(V?)PADDUSWrm", + "(V?)PADDWrm", "(V?)PALIGNRrmi", + "(V?)PAVGBrm", + "(V?)PAVGWrm", + "(V?)PCMPEQBrm", + "(V?)PCMPEQDrm", + "(V?)PCMPEQQrm", + "(V?)PCMPEQWrm", + "(V?)PCMPGTBrm", + "(V?)PCMPGTDrm", + "(V?)PCMPGTWrm", + "(V?)PMAXSBrm", + "(V?)PMAXSDrm", + "(V?)PMAXSWrm", + "(V?)PMAXUBrm", + "(V?)PMAXUDrm", + "(V?)PMAXUWrm", + "(V?)PMINSBrm", + "(V?)PMINSDrm", + "(V?)PMINSWrm", + "(V?)PMINUBrm", + "(V?)PMINUDrm", + "(V?)PMINUWrm", "(V?)PMOVSXBDrm", "(V?)PMOVSXBQrm", "(V?)PMOVSXBWrm", @@ -921,9 +1020,21 @@ "(V?)PMOVZXDQrm", "(V?)PMOVZXWDrm", "(V?)PMOVZXWQrm", + "(V?)PSHUFBrm", "(V?)PSHUFDmi", "(V?)PSHUFHWmi", "(V?)PSHUFLWmi", + "(V?)PSIGNBrm", + "(V?)PSIGNDrm", + "(V?)PSIGNWrm", + "(V?)PSUBBrm", + "(V?)PSUBDrm", + "(V?)PSUBQrm", + "(V?)PSUBSBrm", + "(V?)PSUBSWrm", + "(V?)PSUBUSBrm", + "(V?)PSUBUSWrm", + "(V?)PSUBWrm", "(V?)PUNPCKHBWrm", "(V?)PUNPCKHDQrm", "(V?)PUNPCKHQDQrm", @@ -933,18 +1044,6 @@ "(V?)PUNPCKLQDQrm", "(V?)PUNPCKLWDrm")>; -def SBWriteResGroup59a : SchedWriteRes<[SBPort23,SBPort1]> { - let Latency = 8; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SBWriteResGroup59a], (instregex "MMX_PADD(B|D|W)irm", - "MMX_P(ADD|SUB)(U?)S(B|W)irm", - "MMX_PAVG(B|W)irm", - "MMX_PCMP(EQ|GT)(B|D|W)irm", - "MMX_P(MAX|MIN)(SW|UB)irm", - "MMX_PSUB(B|D|Q|W)irm")>; - def 
SBWriteResGroup61 : SchedWriteRes<[SBPort0,SBPort05]> { let Latency = 7; let NumMicroOps = 3; @@ -1036,6 +1135,15 @@ } def: InstRW<[SBWriteResGroup72], (instrs MUL8m)>; +def SBWriteResGroup73 : SchedWriteRes<[SBPort5,SBPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup73], (instregex "VPERM2F128rm", + "VPERMILPDYrm", + "VPERMILPSYrm")>; + def SBWriteResGroup75 : SchedWriteRes<[SBPort23,SBPort05]> { let Latency = 8; let NumMicroOps = 3; @@ -1061,6 +1169,20 @@ } def: InstRW<[SBWriteResGroup78], (instregex "(V?)PTESTrm")>; +def SBWriteResGroup79 : SchedWriteRes<[SBPort0,SBPort23,SBPort15]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup79], (instregex "(V?)PSLLDrm", + "(V?)PSLLQrm", + "(V?)PSLLWrm", + "(V?)PSRADrm", + "(V?)PSRAWrm", + "(V?)PSRLDrm", + "(V?)PSRLQrm", + "(V?)PSRLWrm")>; + def SBWriteResGroup80 : SchedWriteRes<[SBPort23,SBPort15]> { let Latency = 8; let NumMicroOps = 4; @@ -1135,19 +1257,34 @@ def: InstRW<[SBWriteResGroup88], (instregex "SHLD(16|32|64)mri8", "SHRD(16|32|64)mri8")>; +def SBWriteResGroup89 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup89], (instregex "(V?)PMADDUBSWrm", + "(V?)PMADDWDrm", + "(V?)PMULDQrm", + "(V?)PMULHRSWrm", + "(V?)PMULHUWrm", + "(V?)PMULHWrm", + "(V?)PMULLWrm", + "(V?)PMULUDQrm", + "(V?)PSADBWrm")>; + def SBWriteResGroup89_2 : SchedWriteRes<[SBPort0,SBPort23]> { let Latency = 10; let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SBWriteResGroup89_2], (instregex "MMX_PMADDUBSWrm", - "MMX_PMADDWDirm", - "MMX_PMULHRSWrm", - "MMX_PMULHUWirm", - "MMX_PMULHWirm", - "MMX_PMULLWirm", - "MMX_PMULUDQirm", - "MMX_PSADBWirm")>; +def: InstRW<[SBWriteResGroup89], (instregex "MMX_PMADDUBSWrm", + "MMX_PMADDWDirm", + "MMX_PMULHRSWrm", + "MMX_PMULHUWirm", + "MMX_PMULHWirm", + "MMX_PMULLWirm", + "MMX_PMULUDQirm", + "MMX_PSADBWirm")>; def SBWriteResGroup90 : SchedWriteRes<[SBPort1,SBPort23]> { let Latency = 9; @@ -1171,6 +1308,13 @@ def: InstRW<[SBWriteResGroup91], (instregex "VMASKMOVPDYrm", "VMASKMOVPSYrm")>; +def SBWriteResGroup92 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup92], (instregex "(V?)DPPDrri")>; + def SBWriteResGroup93 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { let Latency = 9; let NumMicroOps = 3; @@ -1354,6 +1498,13 @@ } def: InstRW<[SBWriteResGroup111], (instregex "MUL_F(32|64)m")>; +def SBWriteResGroup112 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> { + let Latency = 12; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup112], (instregex "(V?)DPPS(Y?)rri")>; + def SBWriteResGroup114 : SchedWriteRes<[SBPort1,SBPort23]> { let Latency = 13; let NumMicroOps = 3; @@ -1386,6 +1537,27 @@ } def: InstRW<[SBWriteResGroup119], (instregex "MUL_FI(16|32)m")>; +def SBWriteResGroup120 : SchedWriteRes<[SBPort0,SBPort1,SBPort5,SBPort23]> { + let Latency = 15; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SBWriteResGroup120], (instregex "(V?)DPPDrmi")>; + +def SBWriteResGroup121 : SchedWriteRes<[SBPort0,SBPort1,SBPort5,SBPort23]> { + let Latency = 18; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SBWriteResGroup121], (instregex "(V?)DPPSrmi")>; + +def SBWriteResGroup122 : SchedWriteRes<[SBPort0,SBPort1,SBPort5,SBPort23]> { + let Latency 
= 19; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SBWriteResGroup122], (instregex "VDPPSYrmi")>; + def SBWriteResGroup123 : SchedWriteRes<[SBPort0,SBPort23,SBFPDivider]> { let Latency = 20; let NumMicroOps = 2; Index: lib/Target/X86/X86SchedSkylakeClient.td =================================================================== --- lib/Target/X86/X86SchedSkylakeClient.td +++ lib/Target/X86/X86SchedSkylakeClient.td @@ -156,8 +156,8 @@ defm : SKLWriteResPair; // Floating point compare. defm : SKLWriteResPair; // Floating point compare (YMM/ZMM). defm : SKLWriteResPair; // Floating point compare to flags. -defm : SKLWriteResPair; // Floating point multiplication. -defm : SKLWriteResPair; // Floating point multiplication (YMM/ZMM). +defm : SKLWriteResPair; // Floating point multiplication. +defm : SKLWriteResPair; // Floating point multiplication (YMM/ZMM). defm : SKLWriteResPair; // 10-14 cycles. // Floating point division. defm : SKLWriteResPair; // 10-14 cycles. // Floating point division (YMM/ZMM). defm : SKLWriteResPair; // Floating point square root. @@ -169,16 +169,13 @@ defm : SKLWriteResPair; // Fused Multiply Add. defm : SKLWriteResPair; // Fused Multiply Add (Scalar). defm : SKLWriteResPair; // Fused Multiply Add (YMM/ZMM). -defm : SKLWriteResPair; // Floating point double dot product. -defm : SKLWriteResPair; // Floating point single dot product. -defm : SKLWriteResPair; // Floating point single dot product (YMM). defm : SKLWriteResPair; // Floating point fabs/fchs. defm : SKLWriteResPair; // Floating point and/or/xor logicals. defm : SKLWriteResPair; // Floating point and/or/xor logicals (YMM/ZMM). -defm : SKLWriteResPair; // Floating point vector shuffles. +defm : SKLWriteResPair; // Floating point vector shuffles. defm : SKLWriteResPair; // Floating point vector shuffles (YMM/ZMM). -defm : SKLWriteResPair; // Floating point vector shuffles. -defm : SKLWriteResPair; // Floating point vector shuffles. +defm : SKLWriteResPair; // Floating point vector shuffles. +defm : SKLWriteResPair; // Floating point vector shuffles. defm : SKLWriteResPair; // Floating point vector blends. defm : SKLWriteResPair; // Floating point vector blends. defm : SKLWriteResPair; // Fp vector variable blends. @@ -198,39 +195,20 @@ def : WriteRes; def : WriteRes; -defm : SKLWriteResPair; // Vector integer ALU op, no logicals. -defm : SKLWriteResPair; // Vector integer ALU op, no logicals (YMM/ZMM). +defm : SKLWriteResPair; // Vector integer ALU op, no logicals. defm : SKLWriteResPair; // Vector integer and/or/xor. defm : SKLWriteResPair; // Vector integer and/or/xor (YMM/ZMM). -defm : SKLWriteResPair; // Vector integer multiply. -defm : SKLWriteResPair; // Vector integer multiply (YMM/ZMM). -defm : SKLWriteResPair; // Vector PMULLD. -defm : SKLWriteResPair; // Vector PMULLD (YMM/ZMM). -defm : SKLWriteResPair; // Vector shuffles. -defm : SKLWriteResPair; // Vector shuffles (YMM/ZMM). -defm : SKLWriteResPair; // Vector shuffles. -defm : SKLWriteResPair; // Vector shuffles (YMM/ZMM). +defm : SKLWriteResPair; // Vector integer shifts. +defm : SKLWriteResPair; // Vector integer multiply. +defm : SKLWriteResPair; +defm : SKLWriteResPair; // Vector shuffles. +defm : SKLWriteResPair; // Vector shuffles. defm : SKLWriteResPair; // Vector blends. -defm : SKLWriteResPair; // Vector blends (YMM/ZMM). defm : SKLWriteResPair; // Vector variable blends. -defm : SKLWriteResPair; // Vector variable blends (YMM/ZMM). defm : SKLWriteResPair; // Vector MPSAD. -defm : SKLWriteResPair; // Vector MPSAD. 
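// Illustrative sketch, not part of this patch: the register and memory forms
// of one instruction are usually covered by two separate groups, the memory
// form adding the load port and, roughly, the load latency on top of the ALU
// latency. Names and numbers below are placeholders in the style of the
// SKLWriteResGroup* entries; the extra 6 cycles for the load is an assumption.
def SKLWriteResGroupSketchRR : SchedWriteRes<[SKLPort01]> {
  let Latency = 1;
  let NumMicroOps = 1;
  let ResourceCycles = [1];
}
def SKLWriteResGroupSketchRM : SchedWriteRes<[SKLPort01, SKLPort23]> {
  let Latency = 7;             // 1-cycle ALU op plus an assumed 6-cycle load
  let NumMicroOps = 2;
  let ResourceCycles = [1, 1];
}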
-defm : SKLWriteResPair; // Vector PSADBW. -defm : SKLWriteResPair; // Vector PSADBW. +defm : SKLWriteResPair; // Vector PSADBW. defm : SKLWriteResPair; // Vector PHMINPOS. -// Vector integer shifts. -defm : SKLWriteResPair; -defm : SKLWriteResPair; -defm : X86WriteRes; -defm : X86WriteRes; - -defm : SKLWriteResPair; // Vector integer immediate shifts (XMM). -defm : SKLWriteResPair; // Vector integer immediate shifts (YMM/ZMM). -defm : SKLWriteResPair; // Variable vector shifts. -defm : SKLWriteResPair; // Variable vector shifts (YMM/ZMM). - // Vector insert/extract operations. def : WriteRes { let Latency = 2; @@ -241,7 +219,6 @@ let Latency = 6; let NumMicroOps = 2; } -def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>; def : WriteRes { let Latency = 3; @@ -362,10 +339,11 @@ def : WriteRes { let Latency = 100; } // def WriteSystem : SchedWrite; // AVX2. -defm : SKLWriteResPair; // Fp 256-bit width vector shuffles. -defm : SKLWriteResPair; // Fp 256-bit width vector variable shuffles. -defm : SKLWriteResPair; // 256-bit width vector shuffles. -defm : SKLWriteResPair; // 256-bit width vector variable shuffles. +defm : SKLWriteResPair; // Fp 256-bit width vector shuffles. +defm : SKLWriteResPair; // Fp 256-bit width vector variable shuffles. +defm : SKLWriteResPair; // 256-bit width vector shuffles. +defm : SKLWriteResPair; // 256-bit width vector variable shuffles. +defm : SKLWriteResPair; // Variable vector shifts. // Old microcoded instructions that nobody use. def : WriteRes { let Latency = 100; } // def WriteMicrocoded : SchedWrite; @@ -386,8 +364,7 @@ defm : SKLWriteResPair; defm : SKLWriteResPair; -defm : SKLWriteResPair; -defm : SKLWriteResPair; +defm : SKLWriteResPair; // Remaining instrs. @@ -438,6 +415,60 @@ } def: InstRW<[SKLWriteResGroup4], (instregex "JMP(16|32|64)r")>; +def SKLWriteResGroup5 : SchedWriteRes<[SKLPort01]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup5], (instregex "(V?)PABSB(Y?)rr", + "(V?)PABSD(Y?)rr", + "(V?)PABSW(Y?)rr", + "(V?)PADDSB(Y?)rr", + "(V?)PADDSW(Y?)rr", + "(V?)PADDUSB(Y?)rr", + "(V?)PADDUSW(Y?)rr", + "(V?)PAVGB(Y?)rr", + "(V?)PAVGW(Y?)rr", + "(V?)PCMPEQB(Y?)rr", + "(V?)PCMPEQD(Y?)rr", + "(V?)PCMPEQQ(Y?)rr", + "(V?)PCMPEQW(Y?)rr", + "(V?)PCMPGTB(Y?)rr", + "(V?)PCMPGTD(Y?)rr", + "(V?)PCMPGTW(Y?)rr", + "(V?)PMAXSB(Y?)rr", + "(V?)PMAXSD(Y?)rr", + "(V?)PMAXSW(Y?)rr", + "(V?)PMAXUB(Y?)rr", + "(V?)PMAXUD(Y?)rr", + "(V?)PMAXUW(Y?)rr", + "(V?)PMINSB(Y?)rr", + "(V?)PMINSD(Y?)rr", + "(V?)PMINSW(Y?)rr", + "(V?)PMINUB(Y?)rr", + "(V?)PMINUD(Y?)rr", + "(V?)PMINUW(Y?)rr", + "(V?)PSIGNB(Y?)rr", + "(V?)PSIGND(Y?)rr", + "(V?)PSIGNW(Y?)rr", + "(V?)PSLLD(Y?)ri", + "(V?)PSLLQ(Y?)ri", + "VPSLLVD(Y?)rr", + "VPSLLVQ(Y?)rr", + "(V?)PSLLW(Y?)ri", + "(V?)PSRAD(Y?)ri", + "VPSRAVD(Y?)rr", + "(V?)PSRAW(Y?)ri", + "(V?)PSRLD(Y?)ri", + "(V?)PSRLQ(Y?)ri", + "VPSRLVD(Y?)rr", + "VPSRLVQ(Y?)rr", + "(V?)PSRLW(Y?)ri", + "(V?)PSUBSB(Y?)rr", + "(V?)PSUBSW(Y?)rr", + "(V?)PSUBUSB(Y?)rr", + "(V?)PSUBUSW(Y?)rr")>; + def SKLWriteResGroup6 : SchedWriteRes<[SKLPort05]> { let Latency = 1; let NumMicroOps = 1; @@ -612,6 +643,20 @@ "VPMASKMOVD(Y?)mr", "VPMASKMOVQ(Y?)mr")>; +def SKLWriteResGroup19 : SchedWriteRes<[SKLPort5,SKLPort01]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup19], (instregex "(V?)PSLLDrr", + "(V?)PSLLQrr", + "(V?)PSLLWrr", + "(V?)PSRADrr", + "(V?)PSRAWrr", + "(V?)PSRLDrr", + "(V?)PSRLQrr", + "(V?)PSRLWrr")>; + def SKLWriteResGroup20 : 
SchedWriteRes<[SKLPort6,SKLPort0156]> { let Latency = 2; let NumMicroOps = 2; @@ -774,6 +819,16 @@ } def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PH(ADD|SUB)(D|W)rr")>; +def SKLWriteResGroup38 : SchedWriteRes<[SKLPort5,SKLPort015]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup38], (instregex "(V?)PHADDD(Y?)rr", + "(V?)PHADDW(Y?)rr", + "(V?)PHSUBD(Y?)rr", + "(V?)PHSUBW(Y?)rr")>; + def SKLWriteResGroup39 : SchedWriteRes<[SKLPort5,SKLPort0156]> { let Latency = 3; let NumMicroOps = 3; @@ -858,7 +913,19 @@ } def: InstRW<[SKLWriteResGroup48], (instregex "(V?)CVTDQ2PS(Y?)rr", "(V?)CVTPS2DQ(Y?)rr", - "(V?)CVTTPS2DQ(Y?)rr")>; + "(V?)CVTTPS2DQ(Y?)rr", + "(V?)MULPD(Y?)rr", + "(V?)MULPS(Y?)rr", + "(V?)MULSDrr", + "(V?)MULSSrr", + "(V?)PMADDUBSW(Y?)rr", + "(V?)PMADDWD(Y?)rr", + "(V?)PMULDQ(Y?)rr", + "(V?)PMULHRSW(Y?)rr", + "(V?)PMULHUW(Y?)rr", + "(V?)PMULHW(Y?)rr", + "(V?)PMULLW(Y?)rr", + "(V?)PMULUDQ(Y?)rr")>; def SKLWriteResGroup51 : SchedWriteRes<[SKLPort1,SKLPort5]> { let Latency = 4; @@ -874,6 +941,20 @@ } def: InstRW<[SKLWriteResGroup51_16], (instrs IMUL16r, MUL16r)>; +def SKLWriteResGroup52 : SchedWriteRes<[SKLPort5,SKLPort01]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup52], (instregex "VPSLLDYrr", + "VPSLLQYrr", + "VPSLLWYrr", + "VPSRADYrr", + "VPSRAWYrr", + "VPSRLDYrr", + "VPSRLQYrr", + "VPSRLWYrr")>; + def SKLWriteResGroup53 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237]> { let Latency = 4; let NumMicroOps = 3; @@ -1183,13 +1264,19 @@ let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKLWriteResGroup88], (instregex "(V?)PACKSSDWrm", +def: InstRW<[SKLWriteResGroup88], (instregex "(V?)INSERTPSrm", + "(V?)PACKSSDWrm", "(V?)PACKSSWBrm", "(V?)PACKUSDWrm", "(V?)PACKUSWBrm", "(V?)PALIGNRrmi", "VPBROADCASTBrm", "VPBROADCASTWrm", + "VPERMILPDmi", + "VPERMILPDrm", + "VPERMILPSmi", + "VPERMILPSrm", + "(V?)PSHUFBrm", "(V?)PSHUFDmi", "(V?)PSHUFHWmi", "(V?)PSHUFLWmi", @@ -1200,14 +1287,13 @@ "(V?)PUNPCKLBWrm", "(V?)PUNPCKLDQrm", "(V?)PUNPCKLQDQrm", - "(V?)PUNPCKLWDrm")>; - -def SKLWriteResGroup88a : SchedWriteRes<[SKLPort5,SKLPort23]> { - let Latency = 6; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SKLWriteResGroup88a], (instregex "MMX_PSHUFBrm")>; + "(V?)PUNPCKLWDrm", + "(V?)SHUFPDrmi", + "(V?)SHUFPSrmi", + "(V?)UNPCKHPDrm", + "(V?)UNPCKHPSrm", + "(V?)UNPCKLPDrm", + "(V?)UNPCKLPSrm")>; def SKLWriteResGroup89 : SchedWriteRes<[SKLPort5,SKLPort01]> { let Latency = 7; @@ -1226,14 +1312,54 @@ let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKLWriteResGroup90], (instregex "(V?)PSLLDrm", +def: InstRW<[SKLWriteResGroup90], (instregex "(V?)PABSBrm", + "(V?)PABSDrm", + "(V?)PABSWrm", + "(V?)PADDSBrm", + "(V?)PADDSWrm", + "(V?)PADDUSBrm", + "(V?)PADDUSWrm", + "(V?)PAVGBrm", + "(V?)PAVGWrm", + "(V?)PCMPEQBrm", + "(V?)PCMPEQDrm", + "(V?)PCMPEQQrm", + "(V?)PCMPEQWrm", + "(V?)PCMPGTBrm", + "(V?)PCMPGTDrm", + "(V?)PCMPGTWrm", + "(V?)PMAXSBrm", + "(V?)PMAXSDrm", + "(V?)PMAXSWrm", + "(V?)PMAXUBrm", + "(V?)PMAXUDrm", + "(V?)PMAXUWrm", + "(V?)PMINSBrm", + "(V?)PMINSDrm", + "(V?)PMINSWrm", + "(V?)PMINUBrm", + "(V?)PMINUDrm", + "(V?)PMINUWrm", + "(V?)PSIGNBrm", + "(V?)PSIGNDrm", + "(V?)PSIGNWrm", + "(V?)PSLLDrm", "(V?)PSLLQrm", + "VPSLLVDrm", + "VPSLLVQrm", "(V?)PSLLWrm", "(V?)PSRADrm", + "VPSRAVDrm", "(V?)PSRAWrm", "(V?)PSRLDrm", "(V?)PSRLQrm", - "(V?)PSRLWrm")>; + "(V?)PSRLVDrm", + "VPSRLVQrm", + "(V?)PSRLWrm", + "(V?)PSUBSBrm", + 
"(V?)PSUBSWrm", + "(V?)PSUBUSBrm", + "(V?)PSUBUSWrm")>; def SKLWriteResGroup91 : SchedWriteRes<[SKLPort23,SKLPort015]> { let Latency = 7; @@ -1384,12 +1510,85 @@ "FCOM64m", "FCOMP32m", "FCOMP64m", - "MMX_PSADBWirm", // TODO - SKLWriteResGroup120?? + "VPACKSSDWYrm", + "VPACKSSWBYrm", + "VPACKUSDWYrm", + "VPACKUSWBYrm", + "VPALIGNRYrmi", + "VPBLENDWYrmi", "VPBROADCASTBYrm", "VPBROADCASTWYrm", + "VPERMILPDYrm", + "VPERMILPSYrm", "VPMOVSXBDYrm", "VPMOVSXBQYrm", - "VPMOVSXWQYrm")>; + "VPMOVSXWQYrm", + "VPSHUFBYrm", + "VPSHUFDYmi", + "VPSHUFHWYmi", + "VPSHUFLWYmi", + "VPUNPCKHBWYrm", + "VPUNPCKHDQYrm", + "VPUNPCKHQDQYrm", + "VPUNPCKHWDYrm", + "VPUNPCKLBWYrm", + "VPUNPCKLDQYrm", + "VPUNPCKLQDQYrm", + "VPUNPCKLWDYrm")>; + +def SKLWriteResGroup109 : SchedWriteRes<[SKLPort01,SKLPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup109], (instregex "VPABSBYrm", + "VPABSDYrm", + "VPABSWYrm", + "VPADDSBYrm", + "VPADDSWYrm", + "VPADDUSBYrm", + "VPADDUSWYrm", + "VPAVGBYrm", + "VPAVGWYrm", + "VPCMPEQBYrm", + "VPCMPEQDYrm", + "VPCMPEQQYrm", + "VPCMPEQWYrm", + "VPCMPGTBYrm", + "VPCMPGTDYrm", + "VPCMPGTWYrm", + "VPMAXSBYrm", + "VPMAXSDYrm", + "VPMAXSWYrm", + "VPMAXUBYrm", + "VPMAXUDYrm", + "VPMAXUWYrm", + "VPMINSBYrm", + "VPMINSDYrm", + "VPMINSWYrm", + "VPMINUBYrm", + "VPMINUDYrm", + "VPMINUWYrm", + "VPSIGNBYrm", + "VPSIGNDYrm", + "VPSIGNWYrm", + "VPSLLDYrm", + "VPSLLQYrm", + "VPSLLVDYrm", + "VPSLLVQYrm", + "VPSLLWYrm", + "VPSRADYrm", + "VPSRAVDYrm", + "VPSRAWYrm", + "VPSRLDYrm", + "VPSRLQYrm", + "VPSRLVDYrm", + "VPSRLVQYrm", + "VPSRLWYrm", + "VPSUBSBYrm", + "VPSUBSWYrm", + "VPSUBUSBYrm", + "VPSUBUSWYrm")>; def SKLWriteResGroup110 : SchedWriteRes<[SKLPort23,SKLPort015]> { let Latency = 8; @@ -1494,7 +1693,8 @@ "VPMOVSXBWYrm", "VPMOVSXDQYrm", "VPMOVSXWDYrm", - "VPMOVZXWDYrm")>; + "VPMOVZXWDYrm", + "(V?)PSADBWrm")>; def SKLWriteResGroup122 : SchedWriteRes<[SKLPort01,SKLPort23]> { let Latency = 9; @@ -1524,6 +1724,13 @@ "VCVTPH2PSrm", "(V?)CVTPS2PDrm")>; +def SKLWriteResGroup124 : SchedWriteRes<[SKLPort5,SKLPort01]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup124], (instregex "(V?)DPPDrri")>; + def SKLWriteResGroup126 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { let Latency = 9; let NumMicroOps = 3; @@ -1546,6 +1753,16 @@ def: InstRW<[SKLWriteResGroup128], (instregex "(V?)PHADDSWrm", "(V?)PHSUBSWrm")>; +def SKLWriteResGroup129 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKLWriteResGroup129], (instregex "(V?)PHADDDrm", + "(V?)PHADDWrm", + "(V?)PHSUBDrm", + "(V?)PHSUBWrm")>; + def SKLWriteResGroup130 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort0156]> { let Latency = 9; let NumMicroOps = 4; @@ -1578,11 +1795,18 @@ def: InstRW<[SKLWriteResGroup133], (instregex "(ADD|SUB|SUBR)_F(32|64)m", "ILD_F(16|32|64)m", "VPCMPGTQYrm", + "VPERM2F128rm", + "VPERM2I128rm", + "VPERMDYrm", + "VPERMPDYmi", + "VPERMPSYrm", + "VPERMQYmi", "VPMOVZXBDYrm", "VPMOVZXBQYrm", "VPMOVZXBWYrm", "VPMOVZXDQYrm", - "VPMOVZXWQYrm")>; + "VPMOVZXWQYrm", + "VPSADBWYrm")>; def SKLWriteResGroup134 : SchedWriteRes<[SKLPort01,SKLPort23]> { let Latency = 10; @@ -1593,7 +1817,17 @@ "(V?)CVTPH2PSYrm", "(V?)CVTPS2DQrm", "(V?)CVTSS2SDrm", - "(V?)CVTTPS2DQrm")>; + "(V?)CVTTPS2DQrm", + "(V?)MULPDrm", + "(V?)MULPSrm", + "(V?)PMADDUBSWrm", + "(V?)PMADDWDrm", + "(V?)PMULDQrm", + "(V?)PMULHRSWrm", + "(V?)PMULHUWrm", + "(V?)PMULHWrm", 
+ "(V?)PMULLWrm", + "(V?)PMULUDQrm")>; def SKLWriteResGroup138 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { let Latency = 10; @@ -1618,6 +1852,16 @@ def: InstRW<[SKLWriteResGroup140], (instregex "VPHADDSWYrm", "VPHSUBSWYrm")>; +def SKLWriteResGroup141 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKLWriteResGroup141], (instregex "VPHADDDYrm", + "VPHADDWYrm", + "VPHSUBDYrm", + "VPHSUBWYrm")>; + def SKLWriteResGroup142 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort06,SKLPort0156]> { let Latency = 9; let NumMicroOps = 4; @@ -1671,7 +1915,17 @@ def: InstRW<[SKLWriteResGroup147], (instregex "VCVTDQ2PSYrm", "VCVTPS2DQYrm", "VCVTPS2PDYrm", - "VCVTTPS2DQYrm")>; + "VCVTTPS2DQYrm", + "VMULPDYrm", + "VMULPSYrm", + "VPMADDUBSWYrm", + "VPMADDWDYrm", + "VPMULDQYrm", + "VPMULHRSWYrm", + "VPMULHUWYrm", + "VPMULHWYrm", + "VPMULLWYrm", + "VPMULUDQYrm")>; def SKLWriteResGroup149 : SchedWriteRes<[SKLPort5,SKLPort23]> { let Latency = 11; @@ -1681,7 +1935,8 @@ def: InstRW<[SKLWriteResGroup149], (instregex "FICOM16m", "FICOM32m", "FICOMP16m", - "FICOMP32m")>; + "FICOMP32m", + "VMPSADBWYrmi")>; def SKLWriteResGroup150 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { let Latency = 11; @@ -1781,6 +2036,13 @@ } def: InstRW<[SKLWriteResGroup163], (instregex "VCVTDQ2PDYrm")>; +def SKLWriteResGroup164 : SchedWriteRes<[SKLPort5,SKLPort01]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKLWriteResGroup164], (instregex "(V?)DPPS(Y?)rri")>; + def SKLWriteResGroup166 : SchedWriteRes<[SKLPort0,SKLFPDivider]> { let Latency = 14; let NumMicroOps = 1; @@ -1837,6 +2099,20 @@ def: InstRW<[SKLWriteResGroup172], (instregex "VROUNDPDYm", "VROUNDPSYm")>; +def SKLWriteResGroup172_2 : SchedWriteRes<[SKLPort23,SKLPort01]> { + let Latency = 17; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup172_2], (instregex "VPMULLDYrm")>; + +def SKLWriteResGroup173 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort01]> { + let Latency = 15; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKLWriteResGroup173], (instregex "(V?)DPPDrmi")>; + def SKLWriteResGroup174 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> { let Latency = 15; let NumMicroOps = 10; @@ -1943,6 +2219,13 @@ } def: InstRW<[SKLWriteResGroup186_1], (instregex "VSQRTPSYm")>; +def SKLWriteResGroup187 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort01]> { + let Latency = 19; + let NumMicroOps = 5; + let ResourceCycles = [1,1,3]; +} +def: InstRW<[SKLWriteResGroup187], (instregex "(V?)DPPSrmi")>; + def SKLWriteResGroup189 : SchedWriteRes<[SKLPort0]> { let Latency = 20; let NumMicroOps = 1; @@ -1959,6 +2242,13 @@ } def: InstRW<[SKLWriteResGroup190], (instregex "(V?)DIVPDrm")>; +def SKLWriteResGroup191 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort01]> { + let Latency = 20; + let NumMicroOps = 5; + let ResourceCycles = [1,1,3]; +} +def: InstRW<[SKLWriteResGroup191], (instregex "VDPPSYrmi")>; + def SKLWriteResGroup192 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { let Latency = 20; let NumMicroOps = 8; Index: lib/Target/X86/X86SchedSkylakeServer.td =================================================================== --- lib/Target/X86/X86SchedSkylakeServer.td +++ lib/Target/X86/X86SchedSkylakeServer.td @@ -169,16 +169,13 @@ defm : SKXWriteResPair; // Fused Multiply Add. defm : SKXWriteResPair; // Fused Multiply Add (Scalar). 
defm : SKXWriteResPair; // Fused Multiply Add (YMM/ZMM). -defm : SKXWriteResPair; // Floating point double dot product. -defm : SKXWriteResPair; // Floating point single dot product. -defm : SKXWriteResPair; // Floating point single dot product (YMM). defm : SKXWriteResPair; // Floating point fabs/fchs. defm : SKXWriteResPair; // Floating point and/or/xor logicals. defm : SKXWriteResPair; // Floating point and/or/xor logicals (YMM/ZMM). -defm : SKXWriteResPair; // Floating point vector shuffles. +defm : SKXWriteResPair; // Floating point vector shuffles. defm : SKXWriteResPair; // Floating point vector shuffles (YMM/ZMM). -defm : SKXWriteResPair; // Floating point vector variable shuffles. -defm : SKXWriteResPair; // Floating point vector variable shuffles. +defm : SKXWriteResPair; // Floating point vector variable shuffles. +defm : SKXWriteResPair; // Floating point vector variable shuffles. defm : SKXWriteResPair; // Floating point vector blends. defm : SKXWriteResPair; // Floating point vector blends. defm : SKXWriteResPair; // Fp vector variable blends. @@ -198,40 +195,20 @@ def : WriteRes; def : WriteRes; -defm : SKXWriteResPair; // Vector integer ALU op, no logicals. -defm : SKXWriteResPair; // Vector integer ALU op, no logicals (YMM/ZMM). +defm : SKXWriteResPair; // Vector integer ALU op, no logicals. defm : SKXWriteResPair; // Vector integer and/or/xor. defm : SKXWriteResPair; // Vector integer and/or/xor (YMM/ZMM). -defm : SKXWriteResPair; // Vector integer multiply. -defm : SKXWriteResPair; // Vector integer multiply (YMM/ZMM). -defm : SKXWriteResPair; // Vector PMULLD. -defm : SKXWriteResPair; // Vector PMULLD (YMM/ZMM). -defm : SKXWriteResPair; // Vector shuffles. -defm : SKXWriteResPair; // Vector shuffles (YMM/ZMM). -defm : SKXWriteResPair; // Vector variable shuffles. -defm : SKXWriteResPair; // Vector variable shuffles (YMM/ZMM). +defm : SKXWriteResPair; // Vector integer shifts. +defm : SKXWriteResPair; // Vector integer multiply. +defm : SKXWriteResPair; // Vector integer multiply. +defm : SKXWriteResPair; // Vector shuffles. +defm : SKXWriteResPair; // Vector variable shuffles. defm : SKXWriteResPair; // Vector blends. -defm : SKXWriteResPair; // Vector blends (YMM/ZMM). defm : SKXWriteResPair; // Vector variable blends. -defm : SKXWriteResPair; // Vector variable blends (YMM/ZMM). -defm : SKXWriteResPair; // Vector MPSAD. -defm : SKXWriteResPair; // Vector MPSAD. -defm : SKXWriteResPair; // Vector PSADBW. -defm : SKXWriteResPair; // Vector PSADBW. +defm : SKXWriteResPair; // Vector MPSAD. +defm : SKXWriteResPair; // Vector PSADBW. defm : SKXWriteResPair; // Vector PHMINPOS. -// Vector integer shifts. -defm : SKXWriteResPair; -defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; - -defm : SKXWriteResPair; // Vector integer immediate shifts (XMM). -defm : SKXWriteResPair; // Vector integer immediate shifts (YMM/ZMM). -defm : SKXWriteResPair; // Variable vector shifts. -defm : SKXWriteResPair; // Variable vector shifts (YMM/ZMM). - // Vector insert/extract operations. def : WriteRes { let Latency = 2; @@ -242,7 +219,6 @@ let Latency = 6; let NumMicroOps = 2; } -def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>; def : WriteRes { let Latency = 3; @@ -363,10 +339,11 @@ def : WriteRes { let Latency = 100; } // def WriteSystem : SchedWrite; // AVX2. -defm : SKXWriteResPair; // Fp 256-bit width vector shuffles. -defm : SKXWriteResPair; // Fp 256-bit width vector variable shuffles. 
-defm : SKXWriteResPair; // 256-bit width vector shuffles. -defm : SKXWriteResPair; // 256-bit width vector variable shuffles. +defm : SKXWriteResPair; // Fp 256-bit width vector shuffles. +defm : SKXWriteResPair; // Fp 256-bit width vector variable shuffles. +defm : SKXWriteResPair; // 256-bit width vector shuffles. +defm : SKXWriteResPair; // 256-bit width vector variable shuffles. +defm : SKXWriteResPair; // Variable vector shifts. // Old microcoded instructions that nobody use. def : WriteRes { let Latency = 100; } // def WriteMicrocoded : SchedWrite; @@ -387,8 +364,7 @@ defm : SKXWriteResPair; defm : SKXWriteResPair; -defm : SKXWriteResPair; -defm : SKXWriteResPair; +defm : SKXWriteResPair; // Remaining instrs. @@ -487,6 +463,254 @@ } def: InstRW<[SKXWriteResGroup4], (instregex "JMP(16|32|64)r")>; +def SKXWriteResGroup5 : SchedWriteRes<[SKXPort01]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBYrr", + "VPABSBZ128rr", + "VPABSBZ256rr", + "VPABSBZrr", + "(V?)PABSBrr", + "VPABSDYrr", + "VPABSDZ128rr", + "VPABSDZ256rr", + "VPABSDZrr", + "(V?)PABSDrr", + "VPABSQZ128rr", + "VPABSQZ256rr", + "VPABSQZrr", + "VPABSWYrr", + "VPABSWZ128rr", + "VPABSWZ256rr", + "VPABSWZrr", + "(V?)PABSWrr", + "VPADDSBYrr", + "VPADDSBZ128rr", + "VPADDSBZ256rr", + "VPADDSBZrr", + "(V?)PADDSBrr", + "VPADDSWYrr", + "VPADDSWZ128rr", + "VPADDSWZ256rr", + "VPADDSWZrr", + "(V?)PADDSWrr", + "VPADDUSBYrr", + "VPADDUSBZ128rr", + "VPADDUSBZ256rr", + "VPADDUSBZrr", + "(V?)PADDUSBrr", + "VPADDUSWYrr", + "VPADDUSWZ128rr", + "VPADDUSWZ256rr", + "VPADDUSWZrr", + "(V?)PADDUSWrr", + "VPAVGBYrr", + "VPAVGBZ128rr", + "VPAVGBZ256rr", + "VPAVGBZrr", + "(V?)PAVGBrr", + "VPAVGWYrr", + "VPAVGWZ128rr", + "VPAVGWZ256rr", + "VPAVGWZrr", + "(V?)PAVGWrr", + "(V?)PCMPEQB(Y?)rr", + "(V?)PCMPEQD(Y?)rr", + "(V?)PCMPEQQ(Y?)rr", + "(V?)PCMPEQW(Y?)rr", + "(V?)PCMPGTB(Y?)rr", + "(V?)PCMPGTD(Y?)rr", + "(V?)PCMPGTW(Y?)rr", + "VPMAXSBYrr", + "VPMAXSBZ128rr", + "VPMAXSBZ256rr", + "VPMAXSBZrr", + "(V?)PMAXSBrr", + "VPMAXSDYrr", + "VPMAXSDZ128rr", + "VPMAXSDZ256rr", + "VPMAXSDZrr", + "(V?)PMAXSDrr", + "VPMAXSWYrr", + "VPMAXSWZ128rr", + "VPMAXSWZ256rr", + "VPMAXSWZrr", + "(V?)PMAXSWrr", + "VPMAXUBYrr", + "VPMAXUBZ128rr", + "VPMAXUBZ256rr", + "VPMAXUBZrr", + "(V?)PMAXUBrr", + "VPMAXUDYrr", + "VPMAXUDZ128rr", + "VPMAXUDZ256rr", + "VPMAXUDZrr", + "(V?)PMAXUDrr", + "VPMAXUWYrr", + "VPMAXUWZ128rr", + "VPMAXUWZ256rr", + "VPMAXUWZrr", + "(V?)PMAXUWrr", + "VPMINSBYrr", + "VPMINSBZ128rr", + "VPMINSBZ256rr", + "VPMINSBZrr", + "(V?)PMINSBrr", + "VPMINSDYrr", + "VPMINSDZ128rr", + "VPMINSDZ256rr", + "VPMINSDZrr", + "(V?)PMINSDrr", + "VPMINSWYrr", + "VPMINSWZ128rr", + "VPMINSWZ256rr", + "VPMINSWZrr", + "(V?)PMINSWrr", + "VPMINUBYrr", + "VPMINUBZ128rr", + "VPMINUBZ256rr", + "VPMINUBZrr", + "(V?)PMINUBrr", + "VPMINUDYrr", + "VPMINUDZ128rr", + "VPMINUDZ256rr", + "VPMINUDZrr", + "(V?)PMINUDrr", + "VPMINUWYrr", + "VPMINUWZ128rr", + "VPMINUWZ256rr", + "VPMINUWZrr", + "(V?)PMINUWrr", + "VPROLDZ128ri", + "VPROLDZ256ri", + "VPROLDZri", + "VPROLQZ128ri", + "VPROLQZ256ri", + "VPROLQZri", + "VPROLVDZ128rr", + "VPROLVDZ256rr", + "VPROLVDZrr", + "VPROLVQZ128rr", + "VPROLVQZ256rr", + "VPROLVQZrr", + "VPRORDZ128ri", + "VPRORDZ256ri", + "VPRORDZri", + "VPRORQZ128ri", + "VPRORQZ256ri", + "VPRORQZri", + "VPRORVDZ128rr", + "VPRORVDZ256rr", + "VPRORVDZrr", + "VPRORVQZ128rr", + "VPRORVQZ256rr", + "VPRORVQZrr", + "(V?)PSIGNB(Y?)rr", + "(V?)PSIGND(Y?)rr", + "(V?)PSIGNW(Y?)rr", + "(V?)PSLLDYri", + "VPSLLDZ128ri", + 
"VPSLLDZ256ri", + "VPSLLDZri", + "(V?)PSLLDri", + "VPSLLQYri", + "VPSLLQZ128ri", + "VPSLLQZ256ri", + "VPSLLQZri", + "(V?)PSLLQri", + "VPSLLVDYrr", + "VPSLLVDZ128rr", + "VPSLLVDZ256rr", + "VPSLLVDZrr", + "VPSLLVDrr", + "VPSLLVQYrr", + "VPSLLVQZ128rr", + "VPSLLVQZ256rr", + "VPSLLVQZrr", + "VPSLLVQrr", + "VPSLLVWZ128rr", + "VPSLLVWZ256rr", + "VPSLLVWZrr", + "VPSLLWYri", + "VPSLLWZ128ri", + "VPSLLWZ256ri", + "VPSLLWZri", + "(V?)PSLLWri", + "VPSRADYri", + "VPSRADZ128ri", + "VPSRADZ256ri", + "VPSRADZri", + "(V?)PSRADri", + "VPSRAQZ128ri", + "VPSRAQZ256ri", + "VPSRAQZri", + "VPSRAVDYrr", + "VPSRAVDZ128rr", + "VPSRAVDZ256rr", + "VPSRAVDZrr", + "VPSRAVDrr", + "VPSRAVQZ128rr", + "VPSRAVQZ256rr", + "VPSRAVQZrr", + "VPSRAVWZ128rr", + "VPSRAVWZ256rr", + "VPSRAVWZrr", + "VPSRAWYri", + "VPSRAWZ128ri", + "VPSRAWZ256ri", + "VPSRAWZri", + "(V?)PSRAWri", + "VPSRLDYri", + "VPSRLDZ128ri", + "VPSRLDZ256ri", + "VPSRLDZri", + "(V?)PSRLDri", + "VPSRLQYri", + "VPSRLQZ128ri", + "VPSRLQZ256ri", + "VPSRLQZri", + "(V?)PSRLQri", + "VPSRLVDYrr", + "VPSRLVDZ128rr", + "VPSRLVDZ256rr", + "VPSRLVDZrr", + "VPSRLVDrr", + "VPSRLVQYrr", + "VPSRLVQZ128rr", + "VPSRLVQZ256rr", + "VPSRLVQZrr", + "VPSRLVQrr", + "VPSRLVWZ128rr", + "VPSRLVWZ256rr", + "VPSRLVWZrr", + "VPSRLWYri", + "VPSRLWZ128ri", + "VPSRLWZ256ri", + "VPSRLWZri", + "(V?)PSRLWri", + "VPSUBSBYrr", + "VPSUBSBZ128rr", + "VPSUBSBZ256rr", + "VPSUBSBZrr", + "(V?)PSUBSBrr", + "VPSUBSWYrr", + "VPSUBSWZ128rr", + "VPSUBSWZ256rr", + "VPSUBSWZrr", + "(V?)PSUBSWrr", + "VPSUBUSBYrr", + "VPSUBUSBZ128rr", + "VPSUBUSBZ256rr", + "VPSUBUSBZrr", + "(V?)PSUBUSBrr", + "VPSUBUSWYrr", + "VPSUBUSWZ128rr", + "VPSUBUSWZ256rr", + "VPSUBUSWZrr", + "(V?)PSUBUSWrr")>; + def SKXWriteResGroup6 : SchedWriteRes<[SKXPort05]> { let Latency = 1; let NumMicroOps = 1; @@ -818,6 +1042,28 @@ "VPMASKMOVQYmr", "VPMASKMOVQmr")>; +def SKXWriteResGroup19 : SchedWriteRes<[SKXPort5,SKXPort01]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLDZ128rr", + "(V?)PSLLDrr", + "VPSLLQZ128rr", + "(V?)PSLLQrr", + "VPSLLWZ128rr", + "(V?)PSLLWrr", + "VPSRADZ128rr", + "(V?)PSRADrr", + "VPSRAQZ128rr", + "VPSRAWZ128rr", + "(V?)PSRAWrr", + "VPSRLDZ128rr", + "(V?)PSRLDrr", + "VPSRLQZ128rr", + "(V?)PSRLQrr", + "(V?)PSRLWrr")>; + def SKXWriteResGroup20 : SchedWriteRes<[SKXPort6,SKXPort0156]> { let Latency = 2; let NumMicroOps = 2; @@ -963,7 +1209,9 @@ "VCMPPSZrri", "VCMPSDZrr", "VCMPSSZrr", - "VDBPSADBWZrri", // TODO: 512-bit ops require ports 0/1 to be joined. 
+ "VDBPSADBWZ128rri", + "VDBPSADBWZ256rri", + "VDBPSADBWZrri", "VFPCLASSPDZ128rr", "VFPCLASSPDZ256rr", "VFPCLASSPDZrr", @@ -1036,18 +1284,57 @@ "VPMINUQZ128rr", "VPMINUQZ256rr", "VPMINUQZrr", + "VPMOVQDZ128rr", + "VPMOVQDZ256rr", + "VPMOVQDZrr", "VPMOVSXBDYrr", + "VPMOVSXBDZ128rr", + "VPMOVSXBDZ256rr", + "VPMOVSXBDZrr", "VPMOVSXBQYrr", + "VPMOVSXBQZ128rr", + "VPMOVSXBQZ256rr", + "VPMOVSXBQZrr", "VPMOVSXBWYrr", + "VPMOVSXBWZ128rr", + "VPMOVSXBWZ256rr", + "VPMOVSXBWZrr", "VPMOVSXDQYrr", + "VPMOVSXDQZ128rr", + "VPMOVSXDQZ256rr", + "VPMOVSXDQZrr", "VPMOVSXWDYrr", + "VPMOVSXWDZ128rr", + "VPMOVSXWDZ256rr", + "VPMOVSXWDZrr", "VPMOVSXWQYrr", + "VPMOVSXWQZ128rr", + "VPMOVSXWQZ256rr", + "VPMOVSXWQZrr", "VPMOVZXBDYrr", + "VPMOVZXBDZ128rr", + "VPMOVZXBDZ256rr", + "VPMOVZXBDZrr", "VPMOVZXBQYrr", + "VPMOVZXBQZ128rr", + "VPMOVZXBQZ256rr", + "VPMOVZXBQZrr", "VPMOVZXBWYrr", + "VPMOVZXBWZ128rr", + "VPMOVZXBWZ256rr", + "VPMOVZXBWZrr", "VPMOVZXDQYrr", + "VPMOVZXDQZ128rr", + "VPMOVZXDQZ256rr", + "VPMOVZXDQZrr", "VPMOVZXWDYrr", + "VPMOVZXWDZ128rr", + "VPMOVZXWDZ256rr", + "VPMOVZXWDZrr", "VPMOVZXWQYrr", + "VPMOVZXWQZ128rr", + "VPMOVZXWQZ256rr", + "VPMOVZXWQZrr", "VPSADBWZrr", // TODO: 512-bit ops require ports 0/1 to be joined. "VPTESTMBZ128rr", "VPTESTMBZ256rr", @@ -1129,6 +1416,13 @@ } def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PH(ADD|SUB)(D|W)rr")>; +def SKXWriteResGroup40 : SchedWriteRes<[SKXPort5,SKXPort015]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup40], (instregex "(V?)PH(ADD|SUB)(D|W)(Y?)rr")>; + def SKXWriteResGroup41 : SchedWriteRes<[SKXPort5,SKXPort0156]> { let Latency = 3; let NumMicroOps = 3; @@ -1252,7 +1546,53 @@ "VCVTUDQ2PSZrr", "VCVTUQQ2PDZ128rr", "VCVTUQQ2PDZ256rr", - "VCVTUQQ2PDZrr")>; + "VCVTUQQ2PDZrr", + "VPLZCNTDZ128rr", + "VPLZCNTDZ256rr", + "VPLZCNTDZrr", + "VPLZCNTQZ128rr", + "VPLZCNTQZ256rr", + "VPLZCNTQZrr", + "VPMADDUBSWYrr", + "VPMADDUBSWZ128rr", + "VPMADDUBSWZ256rr", + "VPMADDUBSWZrr", + "(V?)PMADDUBSWrr", + "VPMADDWDYrr", + "VPMADDWDZ128rr", + "VPMADDWDZ256rr", + "VPMADDWDZrr", + "(V?)PMADDWDrr", + "VPMULDQYrr", + "VPMULDQZ128rr", + "VPMULDQZ256rr", + "VPMULDQZrr", + "(V?)PMULDQrr", + "VPMULHRSWYrr", + "VPMULHRSWZ128rr", + "VPMULHRSWZ256rr", + "VPMULHRSWZrr", + "(V?)PMULHRSWrr", + "VPMULHUWYrr", + "VPMULHUWZ128rr", + "VPMULHUWZ256rr", + "VPMULHUWZrr", + "(V?)PMULHUWrr", + "VPMULHWYrr", + "VPMULHWZ128rr", + "VPMULHWZ256rr", + "VPMULHWZrr", + "(V?)PMULHWrr", + "VPMULLWYrr", + "VPMULLWZ128rr", + "VPMULLWZ256rr", + "VPMULLWZrr", + "(V?)PMULLWrr", + "VPMULUDQYrr", + "VPMULUDQZ128rr", + "VPMULUDQZ256rr", + "VPMULUDQZrr", + "(V?)PMULUDQrr")>; def SKXWriteResGroup51 : SchedWriteRes<[SKXPort5]> { let Latency = 4; @@ -1337,6 +1677,38 @@ } def: InstRW<[SKXWriteResGroup52_16], (instrs IMUL16r, MUL16r)>; +def SKXWriteResGroup53 : SchedWriteRes<[SKXPort5,SKXPort01]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLDYrr", + "VPSLLDZ256rr", + "VPSLLDZrr", + "VPSLLQYrr", + "VPSLLQZ256rr", + "VPSLLQZrr", + "VPSLLWYrr", + "VPSLLWZ256rr", + "VPSLLWZrr", + "VPSRADYrr", + "VPSRADZ256rr", + "VPSRADZrr", + "VPSRAQZ256rr", + "VPSRAQZrr", + "VPSRAWYrr", + "VPSRAWZ256rr", + "VPSRAWZrr", + "VPSRLDYrr", + "VPSRLDZ256rr", + "VPSRLDZrr", + "VPSRLQYrr", + "VPSRLQZ256rr", + "VPSRLQZrr", + "VPSRLWYrr", + "VPSRLWZ256rr", + "VPSRLWZrr")>; + def SKXWriteResGroup54 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> { let Latency = 4; let NumMicroOps = 3; @@ -1817,7 +2189,9 
@@ let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKXWriteResGroup92], (instregex "VMOVSDZrm(b?)", +def: InstRW<[SKXWriteResGroup92], (instregex "VINSERTPSZrm(b?)", + "(V?)INSERTPSrm", + "VMOVSDZrm(b?)", "VMOVSSZrm(b?)", "VPACKSSDWZ128rm(b?)", "(V?)PACKSSDWrm", @@ -1833,6 +2207,16 @@ "VPBROADCASTBrm", "VPBROADCASTWZ128m(b?)", "VPBROADCASTWrm", + "VPERMILPDZ128m(b?)i", + "VPERMILPDZ128rm(b?)", + "VPERMILPDmi", + "VPERMILPDrm", + "VPERMILPSZ128m(b?)i", + "VPERMILPSZ128rm(b?)", + "VPERMILPSmi", + "VPERMILPSrm", + "VPSHUFBZ128rm(b?)", + "(V?)PSHUFBrm", "VPSHUFDZ128m(b?)i", "(V?)PSHUFDmi", "VPSHUFHWZ128mi(b?)", @@ -1856,14 +2240,19 @@ "VPUNPCKLQDQZ128rm(b?)", "(V?)PUNPCKLQDQrm", "VPUNPCKLWDZ128rm(b?)", - "(V?)PUNPCKLWDrm")>; - -def SKXWriteResGroup92a : SchedWriteRes<[SKXPort5,SKXPort23]> { - let Latency = 6; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SKXWriteResGroup92a], (instregex "MMX_PSHUFBrm")>; + "(V?)PUNPCKLWDrm", + "VSHUFPDZ128rm(b?)i", + "(V?)SHUFPDrmi", + "VSHUFPSZ128rm(b?)i", + "(V?)SHUFPSrmi", + "VUNPCKHPDZ128rm(b?)", + "(V?)UNPCKHPDrm", + "VUNPCKHPSZ128rm(b?)", + "(V?)UNPCKHPSrm", + "VUNPCKLPDZ128rm(b?)", + "(V?)UNPCKLPDrm", + "VUNPCKLPSZ128rm(b?)", + "(V?)UNPCKLPSrm")>; def SKXWriteResGroup93 : SchedWriteRes<[SKXPort5,SKXPort015]> { let Latency = 7; @@ -1909,6 +2298,120 @@ "VCVTUQQ2PSZ256rr", "VCVTUQQ2PSZrr")>; +def SKXWriteResGroup94 : SchedWriteRes<[SKXPort01,SKXPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup94], (instregex "VPABSBZ128rm(b?)", + "(V?)PABSBrm", + "VPABSDZ128rm(b?)", + "(V?)PABSDrm", + "VPABSQZ128rm(b?)", + "VPABSWZ128rm(b?)", + "(V?)PABSWrm", + "VPADDSBZ128rm(b?)", + "(V?)PADDSBrm", + "VPADDSWZ128rm(b?)", + "(V?)PADDSWrm", + "VPADDUSBZ128rm(b?)", + "(V?)PADDUSBrm", + "VPADDUSWZ128rm(b?)", + "(V?)PADDUSWrm", + "VPAVGBZ128rm(b?)", + "(V?)PAVGBrm", + "VPAVGWZ128rm(b?)", + "(V?)PAVGWrm", + "(V?)PCMPEQBrm", + "(V?)PCMPEQDrm", + "(V?)PCMPEQQrm", + "(V?)PCMPEQWrm", + "(V?)PCMPGTBrm", + "(V?)PCMPGTDrm", + "(V?)PCMPGTWrm", + "VPMAXSBZ128rm(b?)", + "(V?)PMAXSBrm", + "VPMAXSDZ128rm(b?)", + "(V?)PMAXSDrm", + "VPMAXSWZ128rm(b?)", + "(V?)PMAXSWrm", + "VPMAXUBZ128rm(b?)", + "(V?)PMAXUBrm", + "VPMAXUDZ128rm(b?)", + "(V?)PMAXUDrm", + "VPMAXUWZ128rm(b?)", + "(V?)PMAXUWrm", + "VPMINSBZ128rm(b?)", + "(V?)PMINSBrm", + "VPMINSDZ128rm(b?)", + "(V?)PMINSDrm", + "VPMINSWZ128rm(b?)", + "(V?)PMINSWrm", + "VPMINUBZ128rm(b?)", + "(V?)PMINUBrm", + "VPMINUDZ128rm(b?)", + "(V?)PMINUDrm", + "VPMINUWZ128rm(b?)", + "(V?)PMINUWrm", + "VPROLDZ128m(b?)i", + "VPROLQZ128m(b?)i", + "VPROLVDZ128rm(b?)", + "VPROLVQZ128rm(b?)", + "VPRORDZ128m(b?)i", + "VPRORQZ128m(b?)i", + "VPRORVDZ128rm(b?)", + "VPRORVQZ128rm(b?)", + "(V?)PSIGNBrm", + "(V?)PSIGNDrm", + "(V?)PSIGNWrm", + "VPSLLDZ128m(b?)i", + "VPSLLDZ128rm(b?)", + "(V?)PSLLDrm", + "VPSLLQZ128m(b?)i", + "VPSLLQZ128rm(b?)", + "(V?)PSLLQrm", + "VPSLLVDZ128rm(b?)", + "VPSLLVDrm", + "VPSLLVQZ128rm(b?)", + "VPSLLVQrm", + "VPSLLVWZ128rm(b?)", + "VPSLLWZ128mi(b?)", + "VPSLLWZ128rm(b?)", + "(V?)PSLLWrm", + "VPSRADZ128m(b?)i", + "VPSRADZ128rm(b?)", + "(V?)PSRADrm", + "VPSRAQZ128m(b?)i", + "VPSRAQZ128rm(b?)", + "VPSRAVDZ128rm(b?)", + "VPSRAVDrm", + "VPSRAVQZ128rm(b?)", + "VPSRAVWZ128rm(b?)", + "VPSRAWZ128mi(b?)", + "VPSRAWZ128rm(b?)", + "(V?)PSRAWrm", + "VPSRLDZ128m(b?)i", + "VPSRLDZ128rm(b?)", + "(V?)PSRLDrm", + "VPSRLQZ128m(b?)i", + "VPSRLQZ128rm(b?)", + "(V?)PSRLQrm", + "VPSRLVDZ128rm(b?)", + "VPSRLVDrm", + "VPSRLVQZ128rm(b?)", + "VPSRLVQrm", + 
"VPSRLVWZ128rm(b?)", + "VPSRLWZ128mi(b?)", + "VPSRLWZ128rm(b?)", + "(V?)PSRLWrm", + "VPSUBSBZ128rm(b?)", + "(V?)PSUBSBrm", + "VPSUBSWZ128rm(b?)", + "(V?)PSUBSWrm", + "VPSUBUSBZ128rm(b?)", + "(V?)PSUBUSBrm", + "VPSUBUSWZ128rm(b?)", + "(V?)PSUBUSWrm")>; def SKXWriteResGroup95 : SchedWriteRes<[SKXPort23,SKXPort015]> { let Latency = 7; @@ -2186,15 +2689,253 @@ "FCOMP64m", "MMX_PSADBWirm", "VFPCLASSSDrm(b?)", + "VPACKSSDWYrm", + "VPACKSSDWZ256rm(b?)", + "VPACKSSDWZrm(b?)", + "VPACKSSWBYrm", + "VPACKSSWBZ256rm(b?)", + "VPACKSSWBZrm(b?)", + "VPACKUSDWYrm", + "VPACKUSDWZ256rm(b?)", + "VPACKUSDWZrm(b?)", + "VPACKUSWBYrm", + "VPACKUSWBZ256rm(b?)", + "VPACKUSWBZrm(b?)", + "VPALIGNRYrmi", + "VPALIGNRZ256rmi(b?)", + "VPALIGNRZrmi(b?)", + "VPBLENDWYrmi", "VPBROADCASTBYrm", "VPBROADCASTBZ256m(b?)", "VPBROADCASTBZm(b?)", "VPBROADCASTWYrm", "VPBROADCASTWZ256m(b?)", "VPBROADCASTWZm(b?)", + "VPERMILPDYrm", + "VPERMILPDZ256rm(b?)", + "VPERMILPDZrm(b?)", + "VPERMILPSYrm", + "VPERMILPSZ256rm(b?)", + "VPERMILPSZrm(b?)", "VPMOVSXBDYrm", "VPMOVSXBQYrm", - "VPMOVSXWQYrm")>; + "VPMOVSXWQYrm", + "VPSHUFBYrm", + "VPSHUFBZ256rm(b?)", + "VPSHUFBZrm(b?)", + "VPSHUFDYmi", + "VPSHUFDZ256m(b?)i", + "VPSHUFDZm(b?)i", + "VPSHUFHWYmi", + "VPSHUFHWZ256mi(b?)", + "VPSHUFHWZmi(b?)", + "VPSHUFLWYmi", + "VPSHUFLWZ256mi(b?)", + "VPSHUFLWZmi(b?)", + "VPSLLDQZ256rm(b?)", + "VPSLLDQZrm(b?)", + "VPSRLDQZ256rm(b?)", + "VPSRLDQZrm(b?)", + "VPUNPCKHBWYrm", + "VPUNPCKHBWZ256rm(b?)", + "VPUNPCKHBWZrm(b?)", + "VPUNPCKHDQYrm", + "VPUNPCKHDQZ256rm(b?)", + "VPUNPCKHDQZrm(b?)", + "VPUNPCKHQDQYrm", + "VPUNPCKHQDQZ256rm(b?)", + "VPUNPCKHQDQZrm(b?)", + "VPUNPCKHWDYrm", + "VPUNPCKHWDZ256rm(b?)", + "VPUNPCKHWDZrm(b?)", + "VPUNPCKLBWYrm", + "VPUNPCKLBWZ256rm(b?)", + "VPUNPCKLBWZrm(b?)", + "VPUNPCKLDQYrm", + "VPUNPCKLDQZ256rm(b?)", + "VPUNPCKLDQZrm(b?)", + "VPUNPCKLQDQYrm", + "VPUNPCKLQDQZ256rm(b?)", + "VPUNPCKLQDQZrm(b?)", + "VPUNPCKLWDYrm", + "VPUNPCKLWDZ256rm(b?)", + "VPUNPCKLWDZrm(b?)")>; + +def SKXWriteResGroup120 : SchedWriteRes<[SKXPort01,SKXPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup120], (instregex "VPABSBYrm", + "VPABSBZ256rm(b?)", + "VPABSBZrm(b?)", + "VPABSDYrm", + "VPABSDZ256rm(b?)", + "VPABSDZrm(b?)", + "VPABSQZ256rm(b?)", + "VPABSQZrm(b?)", + "VPABSWYrm", + "VPABSWZ256rm(b?)", + "VPABSWZrm(b?)", + "VPADDSBYrm", + "VPADDSBZ256rm(b?)", + "VPADDSBZrm(b?)", + "VPADDSWYrm", + "VPADDSWZ256rm(b?)", + "VPADDSWZrm(b?)", + "VPADDUSBYrm", + "VPADDUSBZ256rm(b?)", + "VPADDUSBZrm(b?)", + "VPADDUSWYrm", + "VPADDUSWZ256rm(b?)", + "VPADDUSWZrm(b?)", + "VPAVGBYrm", + "VPAVGBZ256rm(b?)", + "VPAVGBZrm(b?)", + "VPAVGWYrm", + "VPAVGWZ256rm(b?)", + "VPAVGWZrm(b?)", + "VPCMPEQBYrm", + "VPCMPEQDYrm", + "VPCMPEQQYrm", + "VPCMPEQWYrm", + "VPCMPGTBYrm", + "VPCMPGTDYrm", + "VPCMPGTWYrm", + "VPMAXSBYrm", + "VPMAXSBZ256rm(b?)", + "VPMAXSBZrm(b?)", + "VPMAXSDYrm", + "VPMAXSDZ256rm(b?)", + "VPMAXSDZrm(b?)", + "VPMAXSWYrm", + "VPMAXSWZ256rm(b?)", + "VPMAXSWZrm(b?)", + "VPMAXUBYrm", + "VPMAXUBZ256rm(b?)", + "VPMAXUBZrm(b?)", + "VPMAXUDYrm", + "VPMAXUDZ256rm(b?)", + "VPMAXUDZrm(b?)", + "VPMAXUWYrm", + "VPMAXUWZ256rm(b?)", + "VPMAXUWZrm(b?)", + "VPMINSBYrm", + "VPMINSBZ256rm(b?)", + "VPMINSBZrm(b?)", + "VPMINSDYrm", + "VPMINSDZ256rm(b?)", + "VPMINSDZrm(b?)", + "VPMINSWYrm", + "VPMINSWZ256rm(b?)", + "VPMINSWZrm(b?)", + "VPMINUBYrm", + "VPMINUBZ256rm(b?)", + "VPMINUBZrm(b?)", + "VPMINUDYrm", + "VPMINUDZ256rm(b?)", + "VPMINUDZrm(b?)", + "VPMINUWYrm", + "VPMINUWZ256rm(b?)", + "VPMINUWZrm(b?)", + 
"VPROLDZ256m(b?)i", + "VPROLDZm(b?)i", + "VPROLQZ256m(b?)i", + "VPROLQZm(b?)i", + "VPROLVDZ256rm(b?)", + "VPROLVDZrm(b?)", + "VPROLVQZ256rm(b?)", + "VPROLVQZrm(b?)", + "VPRORDZ256m(b?)i", + "VPRORDZm(b?)i", + "VPRORQZ256m(b?)i", + "VPRORQZm(b?)i", + "VPRORVDZ256rm(b?)", + "VPRORVDZrm(b?)", + "VPRORVQZ256rm(b?)", + "VPRORVQZrm(b?)", + "VPSIGNBYrm", + "VPSIGNDYrm", + "VPSIGNWYrm", + "VPSLLDYrm", + "VPSLLDZ256m(b?)i", + "VPSLLDZ256rm(b?)", + "VPSLLDZm(b?)i", + "VPSLLDZrm(b?)", + "VPSLLQYrm", + "VPSLLQZ256m(b?)i", + "VPSLLQZ256rm(b?)", + "VPSLLQZm(b?)i", + "VPSLLQZrm(b?)", + "VPSLLVDYrm", + "VPSLLVDZ256rm(b?)", + "VPSLLVDZrm(b?)", + "VPSLLVQYrm", + "VPSLLVQZ256rm(b?)", + "VPSLLVQZrm(b?)", + "VPSLLVWZ256rm(b?)", + "VPSLLVWZrm(b?)", + "VPSLLWYrm", + "VPSLLWZ256mi(b?)", + "VPSLLWZ256rm(b?)", + "VPSLLWZmi(b?)", + "VPSLLWZrm(b?)", + "VPSRADYrm", + "VPSRADZ256m(b?)i", + "VPSRADZ256rm(b?)", + "VPSRADZm(b?)i", + "VPSRADZrm(b?)", + "VPSRAQZ256m(b?)i", + "VPSRAQZ256rm(b?)", + "VPSRAQZm(b?)i", + "VPSRAQZrm(b?)", + "VPSRAVDYrm", + "VPSRAVDZ256rm(b?)", + "VPSRAVDZrm(b?)", + "VPSRAVQZ256rm(b?)", + "VPSRAVQZrm(b?)", + "VPSRAVWZ256rm(b?)", + "VPSRAVWZrm(b?)", + "VPSRAWYrm", + "VPSRAWZ256mi(b?)", + "VPSRAWZ256rm(b?)", + "VPSRAWZmi(b?)", + "VPSRAWZrm(b?)", + "VPSRLDYrm", + "VPSRLDZ256m(b?)i", + "VPSRLDZ256rm(b?)", + "VPSRLDZm(b?)i", + "VPSRLDZrm(b?)", + "VPSRLQYrm", + "VPSRLQZ256m(b?)i", + "VPSRLQZ256rm(b?)", + "VPSRLQZm(b?)i", + "VPSRLQZrm(b?)", + "VPSRLVDYrm", + "VPSRLVDZ256rm(b?)", + "VPSRLVDZrm(b?)", + "VPSRLVQYrm", + "VPSRLVQZ256rm(b?)", + "VPSRLVQZrm(b?)", + "VPSRLVWZ256rm(b?)", + "VPSRLVWZrm(b?)", + "VPSRLWYrm", + "VPSRLWZ256mi(b?)", + "VPSRLWZ256rm(b?)", + "VPSRLWZmi(b?)", + "VPSRLWZrm(b?)", + "VPSUBSBYrm", + "VPSUBSBZ256rm(b?)", + "VPSUBSBZrm(b?)", + "VPSUBSWYrm", + "VPSUBSWZ256rm(b?)", + "VPSUBSWZrm(b?)", + "VPSUBUSBYrm", + "VPSUBUSBZ256rm(b?)", + "VPSUBUSBZrm(b?)", + "VPSUBUSWYrm", + "VPSUBUSWZ256rm(b?)", + "VPSUBUSWZrm(b?)")>; def SKXWriteResGroup121 : SchedWriteRes<[SKXPort23,SKXPort015]> { let Latency = 8; @@ -2432,6 +3173,7 @@ "VCMPPSZ128rm(b?)i", "VCMPSDZrm", "VCMPSSZrm", + "VDBPSADBWZ128rmi(b?)", "VFPCLASSSSrm(b?)", "VPCMPBZ128rmi(b?)", "VPCMPDZ128rmi(b?)", @@ -2478,6 +3220,7 @@ "VPMOVZXWDYrm", "VPMOVZXWDZ128rm(b?)", "VPMOVZXWQZ128rm(b?)", + "VPSADBWZ128rm(b?)", "VPTESTMBZ128rm(b?)", "VPTESTMDZ128rm(b?)", "VPTESTMQZ128rm(b?)", @@ -2519,6 +3262,13 @@ "VRSQRT14PDZr(b?)", "VRSQRT14PSZr(b?)")>; +def SKXWriteResGroup139 : SchedWriteRes<[SKXPort5,SKXPort015]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup139], (instregex "(V?)DPPDrri")>; + def SKXWriteResGroup141 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { let Latency = 9; let NumMicroOps = 3; @@ -2541,6 +3291,16 @@ def: InstRW<[SKXWriteResGroup143], (instregex "(V?)PHADDSWrm", "(V?)PHSUBSWrm")>; +def SKXWriteResGroup144 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup144], (instregex "(V?)PHADDDrm", + "(V?)PHADDWrm", + "(V?)PHSUBDrm", + "(V?)PHSUBWrm")>; + def SKXWriteResGroup145 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort0156]> { let Latency = 9; let NumMicroOps = 4; @@ -2572,6 +3332,8 @@ "VCMPPDZrm(b?)i", "VCMPPSZ256rm(b?)i", "VCMPPSZrm(b?)i", + "VDBPSADBWZ256rmi(b?)", + "VDBPSADBWZrmi(b?)", "VPCMPBZ256rmi(b?)", "VPCMPBZrmi(b?)", "VPCMPDZ256rmi(b?)", @@ -2605,6 +3367,40 @@ "VPCMPUWZrmi(b?)", "VPCMPWZ256rmi(b?)", "VPCMPWZrmi(b?)", + "VPERM2F128rm", + 
"VPERM2I128rm", + "VPERMDYrm", + "VPERMDZ256rm(b?)", + "VPERMDZrm(b?)", + "VPERMI2D256rm(b?)", + "VPERMI2Drm(b?)", + "VPERMI2PD256rm(b?)", + "VPERMI2PDrm(b?)", + "VPERMI2PS256rm(b?)", + "VPERMI2PSrm(b?)", + "VPERMI2Q256rm(b?)", + "VPERMI2Qrm(b?)", + "VPERMPDYmi", + "VPERMPDZ256m(b?)i", + "VPERMPDZ256rm(b?)", + "VPERMPDZm(b?)i", + "VPERMPDZrm(b?)", + "VPERMPSYrm", + "VPERMPSZ256rm(b?)", + "VPERMPSZrm(b?)", + "VPERMQYmi", + "VPERMQZ256m(b?)i", + "VPERMQZ256rm(b?)", + "VPERMQZm(b?)i", + "VPERMQZrm(b?)", + "VPERMT2D256rm(b?)", + "VPERMT2Drm(b?)", + "VPERMT2PD256rm(b?)", + "VPERMT2PDrm(b?)", + "VPERMT2PS256rm(b?)", + "VPERMT2PSrm(b?)", + "VPERMT2Q256rm(b?)", + "VPERMT2Qrm(b?)", "VPMAXSQZ256rm(b?)", "VPMAXSQZrm(b?)", "VPMAXUQZ256rm(b?)", @@ -2613,11 +3409,38 @@ "VPMINSQZrm(b?)", "VPMINUQZ256rm(b?)", "VPMINUQZrm(b?)", + "VPMOVSXBDZ256rm(b?)", + "VPMOVSXBDZrm(b?)", + "VPMOVSXBQZ256rm(b?)", + "VPMOVSXBQZrm(b?)", + "VPMOVSXBWZ256rm(b?)", + "VPMOVSXBWZrm(b?)", + "VPMOVSXDQZ256rm(b?)", + "VPMOVSXDQZrm(b?)", + "VPMOVSXWDZ256rm(b?)", + "VPMOVSXWDZrm(b?)", + "VPMOVSXWQZ256rm(b?)", + "VPMOVSXWQZrm(b?)", "VPMOVZXBDYrm", + "VPMOVZXBDZ256rm(b?)", + "VPMOVZXBDZrm(b?)", "VPMOVZXBQYrm", + "VPMOVZXBQZ256rm(b?)", + "VPMOVZXBQZrm(b?)", "VPMOVZXBWYrm", + "VPMOVZXBWZ256rm(b?)", + "VPMOVZXBWZrm(b?)", "VPMOVZXDQYrm", + "VPMOVZXDQZ256rm(b?)", + "VPMOVZXDQZrm(b?)", + "VPMOVZXWDZ256rm(b?)", + "VPMOVZXWDZrm(b?)", "VPMOVZXWQYrm", + "VPMOVZXWQZ256rm(b?)", + "VPMOVZXWQZrm(b?)", + "VPSADBWYrm", + "VPSADBWZ256rm(b?)", + "VPSADBWZrm(b?)", "VPTESTMBZ256rm(b?)", "VPTESTMBZrm(b?)", "VPTESTMDZ256rm(b?)", @@ -2633,7 +3456,15 @@ "VPTESTNMQZ256rm(b?)", "VPTESTNMQZrm(b?)", "VPTESTNMWZ256rm(b?)", - "VPTESTNMWZrm(b?)")>; + "VPTESTNMWZrm(b?)", + "VSHUFF32X4Z256rm(b?)i", + "VSHUFF32X4Zrm(b?)i", + "VSHUFF64X2Z256rm(b?)i", + "VSHUFF64X2Zrm(b?)i", + "VSHUFI32X4Z256rm(b?)i", + "VSHUFI32X4Zrm(b?)i", + "VSHUFI64X2Z256rm(b?)i", + "VSHUFI64X2Zrm(b?)i")>; def SKXWriteResGroup149 : SchedWriteRes<[SKXPort23,SKXPort015]> { let Latency = 10; @@ -2644,6 +3475,14 @@ "CVTPS2DQrm", "CVTSS2SDrm", "CVTTPS2DQrm", + "PMADDUBSWrm", + "PMADDWDrm", + "PMULDQrm", + "PMULHRSWrm", + "PMULHUWrm", + "PMULHWrm", + "PMULLWrm", + "PMULUDQrm", "VCVTDQ2PDZ128rm(b?)", "VCVTDQ2PSZ128rm(b?)", "VCVTDQ2PSrm", @@ -2671,7 +3510,25 @@ "VCVTUDQ2PDZ128rm(b?)", "VCVTUDQ2PSZ128rm(b?)", "VCVTUQQ2PDZ128rm(b?)", - "VCVTUQQ2PSZ128rm(b?)")>; + "VCVTUQQ2PSZ128rm(b?)", + "VPLZCNTDZ128rm(b?)", + "VPLZCNTQZ128rm(b?)", + "VPMADDUBSWZ128rm(b?)", + "VPMADDUBSWrm", + "VPMADDWDZ128rm(b?)", + "VPMADDWDrm", + "VPMULDQZ128rm(b?)", + "VPMULDQrm", + "VPMULHRSWZ128rm(b?)", + "VPMULHRSWrm", + "VPMULHUWZ128rm(b?)", + "VPMULHUWrm", + "VPMULHWZ128rm(b?)", + "VPMULHWrm", + "VPMULLWZ128rm(b?)", + "VPMULLWrm", + "VPMULUDQZ128rm(b?)", + "VPMULUDQrm")>; def SKXWriteResGroup151 : SchedWriteRes<[SKXPort5,SKXPort23]> { let Latency = 10; @@ -2706,6 +3563,16 @@ def: InstRW<[SKXWriteResGroup154], (instregex "VPHADDSWYrm", "VPHSUBSWYrm")>; +def SKXWriteResGroup155 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup155], (instregex "VPHADDDYrm", + "VPHADDWYrm", + "VPHSUBDYrm", + "VPHSUBWYrm")>; + def SKXWriteResGroup156 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort06,SKXPort0156]> { let Latency = 9; let NumMicroOps = 4; @@ -2795,7 +3662,35 @@ "VCVTUDQ2PSZrm(b?)", "VCVTUQQ2PDZ256rm(b?)", "VCVTUQQ2PDZrm(b?)", - "VCVTUQQ2PSZ256rm(b?)")>; + "VCVTUQQ2PSZ256rm(b?)", + "VPLZCNTDZ256rm(b?)", + "VPLZCNTDZrm(b?)", + 
"VPLZCNTQZ256rm(b?)", + "VPLZCNTQZrm(b?)", + "VPMADDUBSWYrm", + "VPMADDUBSWZ256rm(b?)", + "VPMADDUBSWZrm(b?)", + "VPMADDWDYrm", + "VPMADDWDZ256rm(b?)", + "VPMADDWDZrm(b?)", + "VPMULDQYrm", + "VPMULDQZ256rm(b?)", + "VPMULDQZrm(b?)", + "VPMULHRSWYrm", + "VPMULHRSWZ256rm(b?)", + "VPMULHRSWZrm(b?)", + "VPMULHUWYrm", + "VPMULHUWZ256rm(b?)", + "VPMULHUWZrm(b?)", + "VPMULHWYrm", + "VPMULHWZ256rm(b?)", + "VPMULHWZrm(b?)", + "VPMULLWYrm", + "VPMULLWZ256rm(b?)", + "VPMULLWZrm(b?)", + "VPMULUDQYrm", + "VPMULUDQZ256rm(b?)", + "VPMULUDQZrm(b?)")>; def SKXWriteResGroup162 : SchedWriteRes<[SKXPort5,SKXPort23]> { let Latency = 11; @@ -2810,6 +3705,7 @@ "VEXPANDPDZrm(b?)", "VEXPANDPSZ256rm(b?)", "VEXPANDPSZrm(b?)", + "VMPSADBWYrmi", "VPEXPANDDZ256rm(b?)", "VPEXPANDDZrm(b?)", "VPEXPANDQZ256rm(b?)", @@ -2984,6 +3880,15 @@ } def: InstRW<[SKXWriteResGroup181], (instregex "VCVTDQ2PDYrm")>; +def SKXWriteResGroup182 : SchedWriteRes<[SKXPort5,SKXPort015]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKXWriteResGroup182], (instregex "DPPSrri", + "VDPPSYrri", + "VDPPSrri")>; + def SKXWriteResGroup183 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { let Latency = 13; let NumMicroOps = 4; @@ -3079,6 +3984,22 @@ "VROUNDPDYm", "VROUNDPSYm")>; +def SKXWriteResGroup192_2 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 17; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup192_2], (instregex "VPMULLDYrm", + "VPMULLDZ256rm(b?)", + "VPMULLDZrm(b?)")>; + +def SKXWriteResGroup193 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 15; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKXWriteResGroup193], (instregex "(V?)DPPDrmi")>; + def SKXWriteResGroup194 : SchedWriteRes<[SKXPort1,SKXPort5,SKXPort01,SKXPort23,SKXPort015]> { let Latency = 15; let NumMicroOps = 8; @@ -3224,6 +4145,13 @@ def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)", "VPMULLQZrm(b?)")>; +def SKXWriteResGroup212 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 19; + let NumMicroOps = 5; + let ResourceCycles = [1,1,3]; +} +def: InstRW<[SKXWriteResGroup212], (instregex "(V?)DPPSrmi")>; + def SKXWriteResGroup214 : SchedWriteRes<[]> { let Latency = 20; let NumMicroOps = 0; @@ -3248,6 +4176,13 @@ } def: InstRW<[SKXWriteResGroup216], (instregex "(V?)DIVPD(Z128)?rm")>; +def SKXWriteResGroup217 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 20; + let NumMicroOps = 5; + let ResourceCycles = [1,1,3]; +} +def: InstRW<[SKXWriteResGroup217], (instregex "VDPPSYrmi")>; + def SKXWriteResGroup218 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { let Latency = 20; let NumMicroOps = 5; Index: lib/Target/X86/X86Schedule.td =================================================================== --- lib/Target/X86/X86Schedule.td +++ lib/Target/X86/X86Schedule.td @@ -19,17 +19,6 @@ // load + WriteRMW. def WriteRMW : SchedWrite; -// Helper to set SchedWrite ExePorts/Latency/ResourceCycles/NumMicroOps. -multiclass X86WriteRes ExePorts, - int Lat, list Res, int UOps> { - def : WriteRes { - let Latency = Lat; - let ResourceCycles = Res; - let NumMicroOps = UOps; - } -} - // Most instructions can fold loads, so almost every SchedWrite comes in two // variants: With and without a folded load. // An X86FoldableSchedWrite holds a reference to the corresponding SchedWrite @@ -119,9 +108,6 @@ defm WriteFMA : X86SchedWritePair; // Fused Multiply Add. 
defm WriteFMAS : X86SchedWritePair; // Fused Multiply Add (Scalar). defm WriteFMAY : X86SchedWritePair; // Fused Multiply Add (YMM/ZMM). -defm WriteDPPD : X86SchedWritePair; // Floating point double dot product. -defm WriteDPPS : X86SchedWritePair; // Floating point single dot product. -defm WriteDPPSY : X86SchedWritePair; // Floating point single dot product (YMM). defm WriteFSign : X86SchedWritePair; // Floating point fabs/fchs. defm WriteFLogic : X86SchedWritePair; // Floating point and/or/xor logicals. defm WriteFLogicY : X86SchedWritePair; // Floating point and/or/xor logicals (YMM/ZMM). @@ -141,37 +127,23 @@ defm WriteFHAdd : X86SchedWritePair; defm WriteFHAddY : X86SchedWritePair; // YMM/ZMM. defm WritePHAdd : X86SchedWritePair; -defm WritePHAddY : X86SchedWritePair; // YMM/ZMM. // Vector integer operations. def WriteVecLoad : SchedWrite; def WriteVecStore : SchedWrite; def WriteVecMove : SchedWrite; defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals. -defm WriteVecALUY : X86SchedWritePair; // Vector integer ALU op, no logicals (YMM/ZMM). defm WriteVecLogic : X86SchedWritePair; // Vector integer and/or/xor logicals. defm WriteVecLogicY: X86SchedWritePair; // Vector integer and/or/xor logicals (YMM/ZMM). -defm WriteVecShift : X86SchedWritePair; // Vector integer shifts (default). -defm WriteVecShiftX : X86SchedWritePair; // Vector integer shifts (XMM). -defm WriteVecShiftY : X86SchedWritePair; // Vector integer shifts (YMM/ZMM). -defm WriteVecShiftImmX: X86SchedWritePair; // Vector integer immediate shifts (XMM). -defm WriteVecShiftImmY: X86SchedWritePair; // Vector integer immediate shifts (YMM/ZMM). +defm WriteVecShift : X86SchedWritePair; // Vector integer shifts. defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply. -defm WriteVecIMulY : X86SchedWritePair; // Vector integer multiply (YMM/ZMM). -defm WritePMULLD : X86SchedWritePair; // Vector PMULLD. -defm WritePMULLDY : X86SchedWritePair; // Vector PMULLD (YMM/ZMM). +defm WritePMULLD : X86SchedWritePair; // PMULLD defm WriteShuffle : X86SchedWritePair; // Vector shuffles. -defm WriteShuffleY : X86SchedWritePair; // Vector shuffles (YMM/ZMM). defm WriteVarShuffle : X86SchedWritePair; // Vector variable shuffles. -defm WriteVarShuffleY : X86SchedWritePair; // Vector variable shuffles (YMM/ZMM). defm WriteBlend : X86SchedWritePair; // Vector blends. -defm WriteBlendY : X86SchedWritePair; // Vector blends (YMM/ZMM). defm WriteVarBlend : X86SchedWritePair; // Vector variable blends. -defm WriteVarBlendY : X86SchedWritePair; // Vector variable blends (YMM/ZMM). -defm WritePSADBW : X86SchedWritePair; // Vector PSADBW. -defm WritePSADBWY : X86SchedWritePair; // Vector PSADBW (YMM/ZMM). -defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD. -defm WriteMPSADY : X86SchedWritePair; // Vector MPSAD (YMM/ZMM). +defm WritePSADBW : X86SchedWritePair; // Vector PSADBW. +defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD. defm WritePHMINPOS : X86SchedWritePair; // Vector PHMINPOS. // Vector insert/extract operations. @@ -223,8 +195,7 @@ defm WriteFVarShuffle256 : X86SchedWritePair; // Fp 256-bit width variable shuffles. defm WriteShuffle256 : X86SchedWritePair; // 256-bit width vector shuffles. defm WriteVarShuffle256 : X86SchedWritePair; // 256-bit width vector variable shuffles. -defm WriteVarVecShift : X86SchedWritePair; // Variable vector shifts. -defm WriteVarVecShiftY : X86SchedWritePair; // Variable vector shifts (YMM/ZMM). +defm WriteVarVecShift : X86SchedWritePair; // Variable vector shifts. 
// Old microcoded instructions that nobody use. def WriteMicrocoded : SchedWrite; @@ -238,18 +209,12 @@ // Vector width wrappers. def SchedWriteFAdd : X86SchedWriteWidths; -def SchedWriteFHAdd - : X86SchedWriteWidths; def SchedWriteFCmp : X86SchedWriteWidths; def SchedWriteFMul : X86SchedWriteWidths; def SchedWriteFMA : X86SchedWriteWidths; -def SchedWriteDPPD - : X86SchedWriteWidths; -def SchedWriteDPPS - : X86SchedWriteWidths; def SchedWriteFDiv : X86SchedWriteWidths; def SchedWriteFSqrt @@ -274,45 +239,40 @@ WriteFVarBlendY, WriteFVarBlendY>; def SchedWriteVecALU - : X86SchedWriteWidths; -def SchedWritePHAdd - : X86SchedWriteWidths; + : X86SchedWriteWidths; def SchedWriteVecLogic : X86SchedWriteWidths; def SchedWriteVecShift - : X86SchedWriteWidths; -def SchedWriteVecShiftImm - : X86SchedWriteWidths; + : X86SchedWriteWidths; def SchedWriteVarVecShift : X86SchedWriteWidths; + WriteVarVecShift, WriteVarVecShift>; def SchedWriteVecIMul : X86SchedWriteWidths; + WriteVecIMul, WriteVecIMul>; def SchedWritePMULLD : X86SchedWriteWidths; + WritePMULLD, WritePMULLD>; def SchedWriteMPSAD : X86SchedWriteWidths; + WriteMPSAD, WriteMPSAD>; def SchedWritePSADBW : X86SchedWriteWidths; + WritePSADBW, WritePSADBW>; def SchedWriteShuffle : X86SchedWriteWidths; + WriteShuffle, WriteShuffle>; def SchedWriteVarShuffle : X86SchedWriteWidths; + WriteVarShuffle, WriteVarShuffle>; def SchedWriteBlend - : X86SchedWriteWidths; + : X86SchedWriteWidths; def SchedWriteVarBlend : X86SchedWriteWidths; + WriteVarBlend, WriteVarBlend>; //===----------------------------------------------------------------------===// // Generic Processor Scheduler Models. Index: lib/Target/X86/X86ScheduleAtom.td =================================================================== --- lib/Target/X86/X86ScheduleAtom.td +++ lib/Target/X86/X86ScheduleAtom.td @@ -227,9 +227,6 @@ defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. -defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. -defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. -defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. @@ -255,35 +252,21 @@ def : WriteRes; defm : AtomWriteResPair; -defm : AtomWriteResPair; defm : AtomWriteResPair; defm : AtomWriteResPair; defm : AtomWriteResPair; -defm : AtomWriteResPair; -defm : AtomWriteResPair; -defm : AtomWriteResPair; -defm : AtomWriteResPair; defm : AtomWriteResPair; -defm : AtomWriteResPair; defm : AtomWriteResPair; -defm : AtomWriteResPair; defm : AtomWriteResPair; defm : AtomWriteResPair; -defm : AtomWriteResPair; defm : AtomWriteResPair; -defm : AtomWriteResPair; defm : AtomWriteResPair; -defm : AtomWriteResPair; defm : AtomWriteResPair; -defm : AtomWriteResPair; defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. -defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. -defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. -defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. //////////////////////////////////////////////////////////////////////////////// // Vector insert/extract operations. 
@@ -325,7 +308,6 @@ defm : AtomWriteResPair; defm : AtomWriteResPair; defm : AtomWriteResPair; -defm : AtomWriteResPair; //////////////////////////////////////////////////////////////////////////////// // Carry-less multiplication instructions. Index: lib/Target/X86/X86ScheduleBtVer2.td =================================================================== --- lib/Target/X86/X86ScheduleBtVer2.td +++ lib/Target/X86/X86ScheduleBtVer2.td @@ -93,7 +93,7 @@ // folded loads. multiclass JWriteResIntPair ExePorts, - int Lat, list Res = [], int UOps = 1> { + int Lat, list Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. def : WriteRes { let Latency = Lat; @@ -105,14 +105,14 @@ // latency. def : WriteRes { let Latency = !add(Lat, 3); - let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res)); + let ResourceCycles = !listconcat([1], Res); let NumMicroOps = UOps; } } multiclass JWriteResFpuPair ExePorts, - int Lat, list Res = [], int UOps = 1> { + int Lat, list Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. def : WriteRes { let Latency = Lat; @@ -124,7 +124,7 @@ // latency. def : WriteRes { let Latency = !add(Lat, 5); - let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res)); + let ResourceCycles = !listconcat([1], Res); let NumMicroOps = UOps; } } @@ -325,9 +325,6 @@ defm : JWriteResFpuPair; // NOTE: Doesn't exist on Jaguar. defm : JWriteResFpuPair; // NOTE: Doesn't exist on Jaguar. defm : JWriteResFpuPair; // NOTE: Doesn't exist on Jaguar. -defm : JWriteResFpuPair; -defm : JWriteResFpuPair; -defm : JWriteResYMMPair; defm : JWriteResFpuPair; defm : JWriteResYMMPair; defm : JWriteResFpuPair; @@ -407,35 +404,21 @@ def : WriteRes; defm : JWriteResFpuPair; -defm : JWriteResFpuPair; defm : JWriteResFpuPair; -defm : JWriteResFpuPair; -defm : JWriteResFpuPair; -defm : JWriteResFpuPair; -defm : JWriteResFpuPair; defm : JWriteResFpuPair; -defm : JWriteResFpuPair; defm : JWriteResFpuPair; -defm : JWriteResFpuPair; defm : JWriteResFpuPair; -defm : JWriteResFpuPair; defm : JWriteResFpuPair; -defm : JWriteResFpuPair; defm : JWriteResFpuPair; defm : JWriteResFpuPair; -defm : JWriteResFpuPair; defm : JWriteResFpuPair; -defm : JWriteResFpuPair; defm : JWriteResFpuPair; -defm : JWriteResFpuPair; defm : JWriteResFpuPair; -defm : JWriteResFpuPair; defm : JWriteResFpuPair; defm : JWriteResFpuPair; // NOTE: Doesn't exist on Jaguar. defm : JWriteResFpuPair; defm : JWriteResFpuPair; // NOTE: Doesn't exist on Jaguar. defm : JWriteResFpuPair; // NOTE: Doesn't exist on Jaguar. -defm : JWriteResFpuPair; // NOTE: Doesn't exist on Jaguar. //////////////////////////////////////////////////////////////////////////////// // Vector insert/extract operations. @@ -468,7 +451,7 @@ defm : JWriteResFpuPair; defm : JWriteResFpuPair; -defm : JWriteResFpuPair; +defm : JWriteResFpuPair; //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. @@ -477,7 +460,6 @@ defm : JWriteResFpuPair; defm : JWriteResYMMPair; defm : JWriteResFpuPair; -defm : JWriteResFpuPair; //////////////////////////////////////////////////////////////////////////////// // Carry-less multiplication instructions. @@ -486,6 +468,38 @@ defm : JWriteResFpuPair; //////////////////////////////////////////////////////////////////////////////// +// SSE4.1 instructions. 
+//////////////////////////////////////////////////////////////////////////////// + +def JWriteDPPS: SchedWriteRes<[JFPU1, JFPM, JFPA]> { + let Latency = 11; + let ResourceCycles = [1, 3, 3]; + let NumMicroOps = 5; +} +def : InstRW<[JWriteDPPS], (instrs DPPSrri, VDPPSrri)>; + +def JWriteDPPSLd: SchedWriteRes<[JLAGU, JFPU1, JFPM, JFPA]> { + let Latency = 16; + let ResourceCycles = [1, 1, 3, 3]; + let NumMicroOps = 5; +} +def : InstRW<[JWriteDPPSLd], (instrs DPPSrmi, VDPPSrmi)>; + +def JWriteDPPD: SchedWriteRes<[JFPU1, JFPM, JFPA]> { + let Latency = 9; + let ResourceCycles = [1, 3, 3]; + let NumMicroOps = 3; +} +def : InstRW<[JWriteDPPD], (instrs DPPDrri, VDPPDrri)>; + +def JWriteDPPDLd: SchedWriteRes<[JLAGU, JFPU1, JFPM, JFPA]> { + let Latency = 14; + let ResourceCycles = [1, 1, 3, 3]; + let NumMicroOps = 3; +} +def : InstRW<[JWriteDPPDLd], (instrs DPPDrmi, VDPPDrmi)>; + +//////////////////////////////////////////////////////////////////////////////// // SSE4A instructions. //////////////////////////////////////////////////////////////////////////////// @@ -531,6 +545,20 @@ // AVX instructions. //////////////////////////////////////////////////////////////////////////////// +def JWriteVDPPSY: SchedWriteRes<[JFPU1, JFPM, JFPA]> { + let Latency = 12; + let ResourceCycles = [2, 6, 6]; + let NumMicroOps = 10; +} +def : InstRW<[JWriteVDPPSY], (instrs VDPPSYrri)>; + +def JWriteVDPPSYLd: SchedWriteRes<[JLAGU, JFPU1, JFPM, JFPA]> { + let Latency = 17; + let ResourceCycles = [2, 2, 6, 6]; + let NumMicroOps = 10; +} +def : InstRW<[JWriteVDPPSYLd, ReadAfterLd], (instrs VDPPSYrmi)>; + def JWriteVMULYPD: SchedWriteRes<[JFPU1, JFPM]> { let Latency = 4; let ResourceCycles = [2, 4]; Index: lib/Target/X86/X86ScheduleSLM.td =================================================================== --- lib/Target/X86/X86ScheduleSLM.td +++ lib/Target/X86/X86ScheduleSLM.td @@ -144,9 +144,6 @@ defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; -defm : SLMWriteResPair; -defm : SLMWriteResPair; -defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; @@ -165,31 +162,19 @@ def : WriteRes { let Latency = 3; } def : WriteRes; -defm : SLMWriteResPair; -defm : SLMWriteResPair; -defm : SLMWriteResPair; -defm : SLMWriteResPair; -defm : SLMWriteResPair; +defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; -defm : SLMWriteResPair; defm : SLMWriteResPair; -defm : SLMWriteResPair; // FIXME: The below is closer to correct, but caused some perf regressions. //defm : SLMWriteResPair; defm : SLMWriteResPair; -defm : SLMWriteResPair; defm : SLMWriteResPair; -defm : SLMWriteResPair; defm : SLMWriteResPair; -defm : SLMWriteResPair; defm : SLMWriteResPair; -defm : SLMWriteResPair; defm : SLMWriteResPair; -defm : SLMWriteResPair; -defm : SLMWriteResPair; -defm : SLMWriteResPair; +defm : SLMWriteResPair; defm : SLMWriteResPair; // Vector insert/extract operations. @@ -209,7 +194,6 @@ defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; -defm : SLMWriteResPair; // String instructions. 
// Packed Compare Implicit Length Strings, Return Mask @@ -295,6 +279,7 @@ let ResourceCycles = [10, 1]; } + def : WriteRes { let Latency = 100; } def : WriteRes { let Latency = 100; } def : WriteRes; @@ -305,15 +290,13 @@ def : WriteRes; defm : SLMWriteResPair; defm : SLMWriteResPair; -defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; -defm : SLMWriteResPair; -defm : SLMWriteResPair; +defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; @@ -407,7 +390,7 @@ def SLMriteResGroup13 : SchedWriteRes<[SLM_MEC_RSV,SLM_FPC_RSV0,SLMFPDivider]> { let Latency = 74; let NumMicroOps = 1; - let ResourceCycles = [1,1,70]; + let ResourceCycles = [1,70]; } def: InstRW<[SLMriteResGroup13], (instregex "(V?)SQRTPDm")>; Index: lib/Target/X86/X86ScheduleZnver1.td =================================================================== --- lib/Target/X86/X86ScheduleZnver1.td +++ lib/Target/X86/X86ScheduleZnver1.td @@ -100,7 +100,7 @@ // This multiclass is for folded loads for integer units. multiclass ZnWriteResPair ExePorts, - int Lat, list Res = [], int UOps = 1> { + int Lat, list Res = [1], int UOps = 1> { // Register variant takes 1-cycle on Execution Port. def : WriteRes { let Latency = Lat; @@ -112,7 +112,7 @@ // adds 4 cycles to the latency. def : WriteRes { let Latency = !add(Lat, 4); - let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res)); + let ResourceCycles = !listconcat([1], Res); let NumMicroOps = !add(UOps, 1); } } @@ -120,7 +120,7 @@ // This multiclass is for folded loads for floating point units. multiclass ZnWriteResFpuPair ExePorts, - int Lat, list Res = [], int UOps = 1> { + int Lat, list Res = [1], int UOps = 1> { // Register variant takes 1-cycle on Execution Port. def : WriteRes { let Latency = Lat; @@ -132,7 +132,7 @@ // adds 7 cycles to the latency. def : WriteRes { let Latency = !add(Lat, 7); - let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res)); + let ResourceCycles = !listconcat([1], Res); let NumMicroOps = UOps; } } @@ -191,6 +191,8 @@ def : WriteRes; def : WriteRes { let Latency = 8; } +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; @@ -201,7 +203,6 @@ defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; @@ -233,33 +234,22 @@ def : WriteRes { let Latency = 8; } defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; // FIXME -defm : ZnWriteResFpuPair; // FIXME defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; // Vector Shift Operations -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; // Vector insert/extract operations. 
defm : ZnWriteResFpuPair; @@ -293,28 +283,24 @@ defm : ZnWriteResFpuPair; // Microcoded Instructions -def ZnWriteMicrocoded : SchedWriteRes<[]> { - let Latency = 100; -} - -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; +let Latency = 100 in { + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + } //=== Regex based InstRW ===// // Notation: @@ -1026,10 +1012,13 @@ // HADD, HSUB PS/PD // PHADD|PHSUB (S) W/D. -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; +def : InstRW<[WriteMicrocoded], (instregex "MMX_PHADD(W|D)r(r|m)", + "MMX_PHADDSWr(r|m)", + "MMX_PHSUB(W|D)r(r|m)", + "MMX_PHSUBSWrr", + "(V?)PH(ADD|SUB)(W|D)(Y?)r(r|m)", + "(V?)PH(ADD|SUB)SW(Y?)r(r|m)")>; + // PCMPGTQ. def ZnWritePCMPGTQr : SchedWriteRes<[ZnFPU03]>; @@ -1049,6 +1038,18 @@ def : InstRW<[ZnWritePCMPGTQYm], (instregex "(V?)PCMPGTQYrm")>; // PMULLD. +// x,x. +def ZnWritePMULLDr : SchedWriteRes<[ZnFPU0]> { + let Latency = 4; +} +// ymm. +def ZnWritePMULLDYr : SchedWriteRes<[ZnFPU0]> { + let Latency = 5; + let ResourceCycles = [2]; +} +def : InstRW<[ZnWritePMULLDr], (instregex "(V?)PMULLDrr")>; +def : InstRW<[ZnWritePMULLDYr], (instregex "(V?)PMULLDYrr")>; + // x,m. def ZnWritePMULLDm : SchedWriteRes<[ZnAGU, ZnFPU0]> { let Latency = 11; @@ -1447,10 +1448,7 @@ //-- Arithmetic instructions --// // HADD, HSUB PS/PD -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; -def : SchedAlias; +def : InstRW<[WriteMicrocoded], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)r(r|m)")>; // MULL SS/SD PS/PD. // x,x / v,v,v. @@ -1540,19 +1538,17 @@ // DPPS. // x,x,i / v,v,v,i. -def : SchedAlias; -def : SchedAlias; +def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPS(Y?)rri")>; // x,m,i / v,v,m,i. -def : SchedAlias; -def : SchedAlias; +def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPS(Y?)rmi")>; // DPPD. // x,x,i. -def : SchedAlias; +def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPDrri")>; // x,m,i. -def : SchedAlias; +def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPDrmi")>; // VSQRTPS. // y,y. Index: lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp =================================================================== --- lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -82,7 +82,7 @@ Result.first = Candidate; // Fill in the mask bit derived from the shift constant. - Result.second.setBit(BitIndex); + Result.second |= (1 << BitIndex); return Result.first == Candidate; } Index: lib/Transforms/InstCombine/InstCombineSelect.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineSelect.cpp +++ lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -100,41 +100,23 @@ return nullptr; } - // In general, when both constants are non-zero, we would need an offset to - // replace the select. This would require more instructions than we started - // with. But there's one special-case that we handle here because it can - // simplify/reduce the instructions. 
+ // If both select arms are non-zero see if we have a select of the form + // 'x ? 2^n + TC : FC'. Then we can offset both arms by C, use the logic + // for 'x ? 2^n : 0' and fix the thing up at the end. APInt TC = *SelTC; APInt FC = *SelFC; + APInt Offset(TC.getBitWidth(), 0); if (!TC.isNullValue() && !FC.isNullValue()) { - // If the select constants differ by exactly one bit and that's the same - // bit that is masked and checked by the select condition, the select can - // be replaced by bitwise logic to set/clear one bit of the constant result. - if (TC.getBitWidth() != AndMask.getBitWidth() || (TC ^ FC) != AndMask) + if ((TC - FC).isPowerOf2()) + Offset = FC; + else if ((FC - TC).isPowerOf2()) + Offset = TC; + else return nullptr; - if (CreateAnd) { - // If we have to create an 'and', then we must kill the cmp to not - // increase the instruction count. - if (!Cmp->hasOneUse()) - return nullptr; - V = Builder.CreateAnd(V, ConstantInt::get(SelType, AndMask)); - } - bool ExtraBitInTC = TC.ugt(FC); - if (Pred == ICmpInst::ICMP_EQ) { - // If the masked bit in V is clear, clear or set the bit in the result: - // (V & AndMaskC) == 0 ? TC : FC --> (V & AndMaskC) ^ TC - // (V & AndMaskC) == 0 ? TC : FC --> (V & AndMaskC) | TC - Constant *C = ConstantInt::get(SelType, TC); - return ExtraBitInTC ? Builder.CreateXor(V, C) : Builder.CreateOr(V, C); - } - if (Pred == ICmpInst::ICMP_NE) { - // If the masked bit in V is set, set or clear the bit in the result: - // (V & AndMaskC) != 0 ? TC : FC --> (V & AndMaskC) | FC - // (V & AndMaskC) != 0 ? TC : FC --> (V & AndMaskC) ^ FC - Constant *C = ConstantInt::get(SelType, FC); - return ExtraBitInTC ? Builder.CreateOr(V, C) : Builder.CreateXor(V, C); - } - llvm_unreachable("Only expecting equality predicates"); + + // Adjust TC and FC by the offset. + TC -= Offset; + FC -= Offset; } // Make sure one of the select arms is a power-of-2. @@ -170,6 +152,9 @@ if (ShouldNotVal) V = Builder.CreateXor(V, ValC); + // Apply an offset if needed. + if (!Offset.isNullValue()) + V = Builder.CreateAdd(V, ConstantInt::get(V->getType(), Offset)); return V; } @@ -805,13 +790,51 @@ bool Changed = adjustMinMax(SI, *ICI); + ICmpInst::Predicate Pred = ICI->getPredicate(); + Value *CmpLHS = ICI->getOperand(0); + Value *CmpRHS = ICI->getOperand(1); + + // Transform (X >s -1) ? C1 : C2 --> ((X >>s 31) & (C2 - C1)) + C1 + // and (X ((X >>s 31) & (C2 - C1)) + C1 + // FIXME: Type and constness constraints could be lifted, but we have to + // watch code size carefully. We should consider xor instead of + // sub/add when we decide to do that. + // TODO: Merge this with foldSelectICmpAnd somehow. + if (CmpLHS->getType()->isIntOrIntVectorTy() && + CmpLHS->getType() == TrueVal->getType()) { + const APInt *C1, *C2; + if (match(TrueVal, m_APInt(C1)) && match(FalseVal, m_APInt(C2))) { + ICmpInst::Predicate Pred = ICI->getPredicate(); + Value *X; + APInt Mask; + if (decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, X, Mask, false)) { + if (Mask.isSignMask()) { + assert(X == CmpLHS && "Expected to use the compare input directly"); + assert(ICmpInst::isEquality(Pred) && "Expected equality predicate"); + + if (Pred == ICmpInst::ICMP_NE) + std::swap(C1, C2); + + // This shift results in either -1 or 0. + Value *AShr = Builder.CreateAShr(X, Mask.getBitWidth() - 1); + + // Check if we can express the operation with a single or. 
+ if (C2->isAllOnesValue()) + return replaceInstUsesWith(SI, Builder.CreateOr(AShr, *C1)); + + Value *And = Builder.CreateAnd(AShr, *C2 - *C1); + return replaceInstUsesWith(SI, Builder.CreateAdd(And, + ConstantInt::get(And->getType(), *C1))); + } + } + } + } + if (Value *V = foldSelectICmpAnd(SI, ICI, Builder)) return replaceInstUsesWith(SI, V); // NOTE: if we wanted to, this is where to detect integer MIN/MAX - ICmpInst::Predicate Pred = ICI->getPredicate(); - Value *CmpLHS = ICI->getOperand(0); - Value *CmpRHS = ICI->getOperand(1); + if (CmpRHS != CmpLHS && isa(CmpRHS)) { if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) { // Transform (X == C) ? X : Y -> (X == C) ? C : Y Index: lib/Transforms/Instrumentation/GCOVProfiling.cpp =================================================================== --- lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -17,7 +17,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/Sequence.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" @@ -36,8 +35,8 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Instrumentation/GCOVProfiler.h" +#include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include #include @@ -887,195 +886,46 @@ Constant *SummaryInfo = getSummaryInfoFunc(); Constant *EndFile = getEndFileFunc(); - NamedMDNode *CUNodes = M->getNamedMetadata("llvm.dbg.cu"); - if (!CUNodes) { - Builder.CreateRetVoid(); - return WriteoutF; - } - - // Collect the relevant data into a large constant data structure that we can - // walk to write out everything. - StructType *StartFileCallArgsTy = StructType::create( - {Builder.getInt8PtrTy(), Builder.getInt8PtrTy(), Builder.getInt32Ty()}); - StructType *EmitFunctionCallArgsTy = StructType::create( - {Builder.getInt32Ty(), Builder.getInt8PtrTy(), Builder.getInt32Ty(), - Builder.getInt8Ty(), Builder.getInt32Ty()}); - StructType *EmitArcsCallArgsTy = StructType::create( - {Builder.getInt32Ty(), Builder.getInt64Ty()->getPointerTo()}); - StructType *FileInfoTy = - StructType::create({StartFileCallArgsTy, Builder.getInt32Ty(), - EmitFunctionCallArgsTy->getPointerTo(), - EmitArcsCallArgsTy->getPointerTo()}); - - Constant *Zero32 = Builder.getInt32(0); - // Build an explicit array of two zeros for use in ConstantExpr GEP building. - Constant *TwoZero32s[] = {Zero32, Zero32}; - - SmallVector FileInfos; - for (int i : llvm::seq(0, CUNodes->getNumOperands())) { - auto *CU = cast(CUNodes->getOperand(i)); + NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); + if (CU_Nodes) { + for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { + auto *CU = cast(CU_Nodes->getOperand(i)); - // Skip module skeleton (and module) CUs. - if (CU->getDWOId()) - continue; + // Skip module skeleton (and module) CUs. + if (CU->getDWOId()) + continue; - std::string FilenameGcda = mangleName(CU, GCovFileType::GCDA); - uint32_t CfgChecksum = FileChecksums.empty() ? 
0 : FileChecksums[i]; - auto *StartFileCallArgs = ConstantStruct::get( - StartFileCallArgsTy, {Builder.CreateGlobalStringPtr(FilenameGcda), - Builder.CreateGlobalStringPtr(ReversedVersion), - Builder.getInt32(CfgChecksum)}); - - SmallVector EmitFunctionCallArgsArray; - SmallVector EmitArcsCallArgsArray; - for (int j : llvm::seq(0, CountersBySP.size())) { - auto *SP = cast_or_null(CountersBySP[j].second); - uint32_t FuncChecksum = Funcs.empty() ? 0 : Funcs[j]->getFuncChecksum(); - EmitFunctionCallArgsArray.push_back(ConstantStruct::get( - EmitFunctionCallArgsTy, - {Builder.getInt32(j), - Options.FunctionNamesInData - ? Builder.CreateGlobalStringPtr(getFunctionName(SP)) - : Constant::getNullValue(Builder.getInt8PtrTy()), - Builder.getInt32(FuncChecksum), - Builder.getInt8(Options.UseCfgChecksum), - Builder.getInt32(CfgChecksum)})); - - GlobalVariable *GV = CountersBySP[j].first; - unsigned Arcs = cast(GV->getValueType())->getNumElements(); - EmitArcsCallArgsArray.push_back(ConstantStruct::get( - EmitArcsCallArgsTy, - {Builder.getInt32(Arcs), ConstantExpr::getInBoundsGetElementPtr( - GV->getValueType(), GV, TwoZero32s)})); + std::string FilenameGcda = mangleName(CU, GCovFileType::GCDA); + uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i]; + Builder.CreateCall(StartFile, + {Builder.CreateGlobalStringPtr(FilenameGcda), + Builder.CreateGlobalStringPtr(ReversedVersion), + Builder.getInt32(CfgChecksum)}); + for (unsigned j = 0, e = CountersBySP.size(); j != e; ++j) { + auto *SP = cast_or_null(CountersBySP[j].second); + uint32_t FuncChecksum = Funcs.empty() ? 0 : Funcs[j]->getFuncChecksum(); + Builder.CreateCall( + EmitFunction, + {Builder.getInt32(j), + Options.FunctionNamesInData + ? Builder.CreateGlobalStringPtr(getFunctionName(SP)) + : Constant::getNullValue(Builder.getInt8PtrTy()), + Builder.getInt32(FuncChecksum), + Builder.getInt8(Options.UseCfgChecksum), + Builder.getInt32(CfgChecksum)}); + + GlobalVariable *GV = CountersBySP[j].first; + unsigned Arcs = + cast(GV->getValueType())->getNumElements(); + Builder.CreateCall(EmitArcs, {Builder.getInt32(Arcs), + Builder.CreateConstGEP2_64(GV, 0, 0)}); + } + Builder.CreateCall(SummaryInfo, {}); + Builder.CreateCall(EndFile, {}); } - // Create global arrays for the two emit calls. 
- int CountersSize = CountersBySP.size(); - assert(CountersSize == (int)EmitFunctionCallArgsArray.size() && - "Mismatched array size!"); - assert(CountersSize == (int)EmitArcsCallArgsArray.size() && - "Mismatched array size!"); - auto *EmitFunctionCallArgsArrayTy = - ArrayType::get(EmitFunctionCallArgsTy, CountersSize); - auto *EmitFunctionCallArgsArrayGV = new GlobalVariable( - *M, EmitFunctionCallArgsArrayTy, /*isConstant*/ true, - GlobalValue::InternalLinkage, - ConstantArray::get(EmitFunctionCallArgsArrayTy, - EmitFunctionCallArgsArray), - Twine("__llvm_internal_gcov_emit_function_args.") + Twine(i)); - auto *EmitArcsCallArgsArrayTy = - ArrayType::get(EmitArcsCallArgsTy, CountersSize); - EmitFunctionCallArgsArrayGV->setUnnamedAddr( - GlobalValue::UnnamedAddr::Global); - auto *EmitArcsCallArgsArrayGV = new GlobalVariable( - *M, EmitArcsCallArgsArrayTy, /*isConstant*/ true, - GlobalValue::InternalLinkage, - ConstantArray::get(EmitArcsCallArgsArrayTy, EmitArcsCallArgsArray), - Twine("__llvm_internal_gcov_emit_arcs_args.") + Twine(i)); - EmitArcsCallArgsArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - - FileInfos.push_back(ConstantStruct::get( - FileInfoTy, - {StartFileCallArgs, Builder.getInt32(CountersSize), - ConstantExpr::getInBoundsGetElementPtr(EmitFunctionCallArgsArrayTy, - EmitFunctionCallArgsArrayGV, - TwoZero32s), - ConstantExpr::getInBoundsGetElementPtr( - EmitArcsCallArgsArrayTy, EmitArcsCallArgsArrayGV, TwoZero32s)})); } - // If we didn't find anything to actually emit, bail on out. - if (FileInfos.empty()) { - Builder.CreateRetVoid(); - return WriteoutF; - } - - // To simplify code, we cap the number of file infos we write out to fit - // easily in a 32-bit signed integer. This gives consistent behavior between - // 32-bit and 64-bit systems without requiring (potentially very slow) 64-bit - // operations on 32-bit systems. It also seems unreasonable to try to handle - // more than 2 billion files. - if ((int64_t)FileInfos.size() > (int64_t)INT_MAX) - FileInfos.resize(INT_MAX); - - // Create a global for the entire data structure so we can walk it more - // easily. - auto *FileInfoArrayTy = ArrayType::get(FileInfoTy, FileInfos.size()); - auto *FileInfoArrayGV = new GlobalVariable( - *M, FileInfoArrayTy, /*isConstant*/ true, GlobalValue::InternalLinkage, - ConstantArray::get(FileInfoArrayTy, FileInfos), - "__llvm_internal_gcov_emit_file_info"); - FileInfoArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - - // Create the CFG for walking this data structure. - auto *FileLoopHeader = - BasicBlock::Create(*Ctx, "file.loop.header", WriteoutF); - auto *CounterLoopHeader = - BasicBlock::Create(*Ctx, "counter.loop.header", WriteoutF); - auto *FileLoopLatch = BasicBlock::Create(*Ctx, "file.loop.latch", WriteoutF); - auto *ExitBB = BasicBlock::Create(*Ctx, "exit", WriteoutF); - - // We always have at least one file, so just branch to the header. - Builder.CreateBr(FileLoopHeader); - - // The index into the files structure is our loop induction variable. 
- Builder.SetInsertPoint(FileLoopHeader); - PHINode *IV = - Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues*/ 2); - IV->addIncoming(Builder.getInt32(0), BB); - auto *FileInfoPtr = - Builder.CreateInBoundsGEP(FileInfoArrayGV, {Builder.getInt32(0), IV}); - auto *StartFileCallArgsPtr = Builder.CreateStructGEP(FileInfoPtr, 0); - Builder.CreateCall( - StartFile, - {Builder.CreateLoad(Builder.CreateStructGEP(StartFileCallArgsPtr, 0)), - Builder.CreateLoad(Builder.CreateStructGEP(StartFileCallArgsPtr, 1)), - Builder.CreateLoad(Builder.CreateStructGEP(StartFileCallArgsPtr, 2))}); - auto *NumCounters = - Builder.CreateLoad(Builder.CreateStructGEP(FileInfoPtr, 1)); - auto *EmitFunctionCallArgsArray = - Builder.CreateLoad(Builder.CreateStructGEP(FileInfoPtr, 2)); - auto *EmitArcsCallArgsArray = - Builder.CreateLoad(Builder.CreateStructGEP(FileInfoPtr, 3)); - auto *EnterCounterLoopCond = - Builder.CreateICmpSLT(Builder.getInt32(0), NumCounters); - Builder.CreateCondBr(EnterCounterLoopCond, CounterLoopHeader, FileLoopLatch); - - Builder.SetInsertPoint(CounterLoopHeader); - auto *JV = Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues*/ 2); - JV->addIncoming(Builder.getInt32(0), FileLoopHeader); - auto *EmitFunctionCallArgsPtr = - Builder.CreateInBoundsGEP(EmitFunctionCallArgsArray, {JV}); - Builder.CreateCall( - EmitFunction, - {Builder.CreateLoad(Builder.CreateStructGEP(EmitFunctionCallArgsPtr, 0)), - Builder.CreateLoad(Builder.CreateStructGEP(EmitFunctionCallArgsPtr, 1)), - Builder.CreateLoad(Builder.CreateStructGEP(EmitFunctionCallArgsPtr, 2)), - Builder.CreateLoad(Builder.CreateStructGEP(EmitFunctionCallArgsPtr, 3)), - Builder.CreateLoad( - Builder.CreateStructGEP(EmitFunctionCallArgsPtr, 4))}); - auto *EmitArcsCallArgsPtr = - Builder.CreateInBoundsGEP(EmitArcsCallArgsArray, {JV}); - Builder.CreateCall( - EmitArcs, - {Builder.CreateLoad(Builder.CreateStructGEP(EmitArcsCallArgsPtr, 0)), - Builder.CreateLoad(Builder.CreateStructGEP(EmitArcsCallArgsPtr, 1))}); - auto *NextJV = Builder.CreateAdd(JV, Builder.getInt32(1)); - auto *CounterLoopCond = Builder.CreateICmpSLT(NextJV, NumCounters); - Builder.CreateCondBr(CounterLoopCond, CounterLoopHeader, FileLoopLatch); - JV->addIncoming(NextJV, CounterLoopHeader); - - Builder.SetInsertPoint(FileLoopLatch); - Builder.CreateCall(SummaryInfo, {}); - Builder.CreateCall(EndFile, {}); - auto *NextIV = Builder.CreateAdd(IV, Builder.getInt32(1)); - auto *FileLoopCond = - Builder.CreateICmpSLT(NextIV, Builder.getInt32(FileInfos.size())); - Builder.CreateCondBr(FileLoopCond, FileLoopHeader, ExitBB); - IV->addIncoming(NextIV, FileLoopLatch); - - Builder.SetInsertPoint(ExitBB); Builder.CreateRetVoid(); - return WriteoutF; } Index: lib/Transforms/ObjCARC/ObjCARCContract.cpp =================================================================== --- lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -597,35 +597,35 @@ // trivially dominate itself, which would lead us to rewriting its // argument in terms of its return value, which would lead to // infinite loops in GetArgRCIdentityRoot. - if (!DT->isReachableFromEntry(U) || !DT->dominates(Inst, U)) - continue; - - Changed = true; - Instruction *Replacement = Inst; - Type *UseTy = U.get()->getType(); - if (PHINode *PHI = dyn_cast(U.getUser())) { - // For PHI nodes, insert the bitcast in the predecessor block. 
- unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo); - BasicBlock *BB = PHI->getIncomingBlock(ValNo); - if (Replacement->getType() != UseTy) - Replacement = new BitCastInst(Replacement, UseTy, "", &BB->back()); - // While we're here, rewrite all edges for this PHI, rather - // than just one use at a time, to minimize the number of - // bitcasts we emit. - for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) - if (PHI->getIncomingBlock(i) == BB) { - // Keep the UI iterator valid. - if (UI != UE && - &PHI->getOperandUse( - PHINode::getOperandNumForIncomingValue(i)) == &*UI) - ++UI; - PHI->setIncomingValue(i, Replacement); - } - } else { - if (Replacement->getType() != UseTy) - Replacement = new BitCastInst(Replacement, UseTy, "", - cast<Instruction>(U.getUser())); - U.set(Replacement); + if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) { + Changed = true; + Instruction *Replacement = Inst; + Type *UseTy = U.get()->getType(); + if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) { + // For PHI nodes, insert the bitcast in the predecessor block. + unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo); + BasicBlock *BB = PHI->getIncomingBlock(ValNo); + if (Replacement->getType() != UseTy) + Replacement = new BitCastInst(Replacement, UseTy, "", + &BB->back()); + // While we're here, rewrite all edges for this PHI, rather + // than just one use at a time, to minimize the number of + // bitcasts we emit. + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) + if (PHI->getIncomingBlock(i) == BB) { + // Keep the UI iterator valid. + if (UI != UE && + &PHI->getOperandUse( + PHINode::getOperandNumForIncomingValue(i)) == &*UI) + ++UI; + PHI->setIncomingValue(i, Replacement); + } + } else { + if (Replacement->getType() != UseTy) + Replacement = new BitCastInst(Replacement, UseTy, "", + cast<Instruction>(U.getUser())); + U.set(Replacement); + } } } }; Index: lib/Transforms/Scalar/DeadStoreElimination.cpp =================================================================== --- lib/Transforms/Scalar/DeadStoreElimination.cpp +++ lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -343,8 +343,7 @@ const TargetLibraryInfo &TLI, int64_t &EarlierOff, int64_t &LaterOff, Instruction *DepWrite, - InstOverlapIntervalsTy &IOL, - AliasAnalysis &AA) { + InstOverlapIntervalsTy &IOL) { // If we don't know the sizes of either access, then we can't do a comparison. if (Later.Size == MemoryLocation::UnknownSize || Earlier.Size == MemoryLocation::UnknownSize) @@ -355,7 +354,7 @@ // If the start pointers are the same, we just have to compare sizes to see if // the later store was larger than the earlier store. - if (P1 == P2 || AA.isMustAlias(P1, P2)) { + if (P1 == P2) { // Make sure that the Later size is >= the Earlier size.
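// (With identical start pointers the later store begins exactly where the earlier one does, so it completely overwrites the earlier store whenever it writes at least as many bytes.)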
if (Later.Size >= Earlier.Size) return OW_Complete; @@ -1163,8 +1162,9 @@ if (isRemovable(DepWrite) && !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) { int64_t InstWriteOffset, DepWriteOffset; - OverwriteResult OR = isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, - InstWriteOffset, DepWrite, IOL, *AA); + OverwriteResult OR = + isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset, + DepWrite, IOL); if (OR == OW_Complete) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); Index: lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp =================================================================== --- lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -925,12 +925,11 @@ return None; } const SCEV* StepRec = IndVarBase->getStepRecurrence(SE); - if (!isa(StepRec)) { + ConstantInt *StepCI = dyn_cast(StepRec)->getValue(); + if (!StepCI) { FailureReason = "LHS in icmp not induction variable"; return None; } - ConstantInt *StepCI = cast(StepRec)->getValue(); - if (ICI->isEquality() && !HasNoSignedWrap(IndVarBase)) { FailureReason = "LHS in icmp needs nsw for equality predicates"; return None; Index: lib/Transforms/Scalar/LoopIdiomRecognize.cpp =================================================================== --- lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1196,13 +1196,14 @@ VarX1 = DefX2->getOperand(0); SubOneOp = dyn_cast(DefX2->getOperand(1)); } - if (!SubOneOp || SubOneOp->getOperand(0) != VarX1) + if (!SubOneOp) return false; - ConstantInt *Dec = dyn_cast(SubOneOp->getOperand(1)); + Instruction *SubInst = cast(SubOneOp); + ConstantInt *Dec = dyn_cast(SubInst->getOperand(1)); if (!Dec || - !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) || - (SubOneOp->getOpcode() == Instruction::Add && + !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) || + (SubInst->getOpcode() == Instruction::Add && Dec->isMinusOne()))) { return false; } @@ -1372,13 +1373,13 @@ bool IsCntPhiUsedOutsideLoop = false; for (User *U : CntPhi->users()) - if (!CurLoop->contains(cast(U))) { + if (!CurLoop->contains(dyn_cast(U))) { IsCntPhiUsedOutsideLoop = true; break; } bool IsCntInstUsedOutsideLoop = false; for (User *U : CntInst->users()) - if (!CurLoop->contains(cast(U))) { + if (!CurLoop->contains(dyn_cast(U))) { IsCntInstUsedOutsideLoop = true; break; } @@ -1416,8 +1417,9 @@ // br i1 %tobool IRBuilder<> Builder(PH->getTerminator()); - const Value *Args[] = + SmallVector Ops = {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()}; + ArrayRef Args(Ops); if (CurLoop->getHeader()->size() != 6 && TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) > TargetTransformInfo::TCC_Basic) @@ -1539,7 +1541,7 @@ void LoopIdiomRecognize::transformLoopToCountable( BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX, const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) { - BranchInst *PreheaderBr = cast(Preheader->getTerminator()); + BranchInst *PreheaderBr = dyn_cast(Preheader->getTerminator()); // Step 1: Insert the CTLZ instruction at the end of the preheader block // Count = BitWidth - CTLZ(InitX); @@ -1587,7 +1589,7 @@ // ... 
// Br: loop if (Dec != 0) BasicBlock *Body = *(CurLoop->block_begin()); - auto *LbBr = cast(Body->getTerminator()); + auto *LbBr = dyn_cast(Body->getTerminator()); ICmpInst *LbCond = cast(LbBr->getCondition()); Type *Ty = Count->getType(); @@ -1624,7 +1626,7 @@ Instruction *CntInst, PHINode *CntPhi, Value *Var) { BasicBlock *PreHead = CurLoop->getLoopPreheader(); - auto *PreCondBr = cast(PreCondBB->getTerminator()); + auto *PreCondBr = dyn_cast(PreCondBB->getTerminator()); const DebugLoc DL = CntInst->getDebugLoc(); // Assuming before transformation, the loop is following: @@ -1695,7 +1697,7 @@ // do { cnt++; x &= x-1; t--) } while (t > 0); BasicBlock *Body = *(CurLoop->block_begin()); { - auto *LbBr = cast(Body->getTerminator()); + auto *LbBr = dyn_cast(Body->getTerminator()); ICmpInst *LbCond = cast(LbBr->getCondition()); Type *Ty = TripCnt->getType(); Index: lib/Transforms/Scalar/Reassociate.cpp =================================================================== --- lib/Transforms/Scalar/Reassociate.cpp +++ lib/Transforms/Scalar/Reassociate.cpp @@ -810,7 +810,7 @@ /// pushing the negates through adds. These will be revisited to see if /// additional opportunities have been exposed. static Value *NegateValue(Value *V, Instruction *BI, - ReassociatePass::OrderedSet &ToRedo) { + SetVector> &ToRedo) { if (auto *C = dyn_cast(V)) return C->getType()->isFPOrFPVectorTy() ? ConstantExpr::getFNeg(C) : ConstantExpr::getNeg(C); @@ -924,8 +924,8 @@ /// If we have (X-Y), and if either X is an add, or if this is only used by an /// add, transform this into (X+(0-Y)) to promote better reassociation. -static BinaryOperator *BreakUpSubtract(Instruction *Sub, - ReassociatePass::OrderedSet &ToRedo) { +static BinaryOperator * +BreakUpSubtract(Instruction *Sub, SetVector> &ToRedo) { // Convert a subtract into an add and a neg instruction. This allows sub // instructions to be commuted with other add instructions. // @@ -1871,8 +1871,8 @@ // Remove dead instructions and if any operands are trivially dead add them to // Insts so they will be removed as well. -void ReassociatePass::RecursivelyEraseDeadInsts(Instruction *I, - OrderedSet &Insts) { +void ReassociatePass::RecursivelyEraseDeadInsts( + Instruction *I, SetVector> &Insts) { assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!"); SmallVector Ops(I->op_begin(), I->op_end()); ValueRankMap.erase(I); @@ -2333,7 +2333,7 @@ // Make a copy of all the instructions to be redone so we can remove dead // instructions. - OrderedSet ToRedo(RedoInsts); + SetVector> ToRedo(RedoInsts); // Iterate over all instructions to be reevaluated and remove trivially dead // instructions. If any operand of the trivially dead instruction becomes // dead mark it for deletion as well. Continue this process until all @@ -2349,8 +2349,7 @@ // Now that we have removed dead instructions, we can reoptimize the // remaining instructions. while (!RedoInsts.empty()) { - Instruction *I = RedoInsts.front(); - RedoInsts.erase(RedoInsts.begin()); + Instruction *I = RedoInsts.pop_back_val(); if (isInstructionTriviallyDead(I)) EraseInst(I); else Index: test/Analysis/MemorySSA/invariant-groups.ll =================================================================== --- test/Analysis/MemorySSA/invariant-groups.ll +++ test/Analysis/MemorySSA/invariant-groups.ll @@ -1,7 +1,7 @@ ; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s ; ; Currently, MemorySSA doesn't support invariant groups. 
So, we should ignore -; launder.invariant.group intrinsics entirely. We'll need to pay attention to +; invariant.group.barrier intrinsics entirely. We'll need to pay attention to ; them when/if we decide to support invariant groups. @g = external global i32 @@ -17,8 +17,8 @@ %1 = bitcast i32* %a to i8* ; CHECK: 3 = MemoryDef(2) -; CHECK-NEXT: %a8 = call i8* @llvm.launder.invariant.group.p0i8(i8* %1) - %a8 = call i8* @llvm.launder.invariant.group.p0i8(i8* %1) +; CHECK-NEXT: %a8 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %1) + %a8 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %1) %a32 = bitcast i8* %a8 to i32* ; This have to be MemoryUse(2), because we can't skip the barrier based on @@ -36,8 +36,8 @@ %1 = bitcast i32* %a to i8* ; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: %a8 = call i8* @llvm.launder.invariant.group.p0i8(i8* %1) - %a8 = call i8* @llvm.launder.invariant.group.p0i8(i8* %1) +; CHECK-NEXT: %a8 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %1) + %a8 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %1) %a32 = bitcast i8* %a8 to i32* ; We can skip the barrier only if the "skip" is not based on !invariant.group. @@ -55,8 +55,8 @@ %1 = bitcast i32* %a to i8* ; CHECK: 1 = MemoryDef(liveOnEntry) -; CHECK-NEXT: %a8 = call i8* @llvm.launder.invariant.group.p0i8(i8* %1) - %a8 = call i8* @llvm.launder.invariant.group.p0i8(i8* %1) +; CHECK-NEXT: %a8 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %1) + %a8 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %1) %a32 = bitcast i8* %a8 to i32* ; We can skip the barrier only if the "skip" is not based on !invariant.group. @@ -86,8 +86,8 @@ store i32 1, i32* @g, align 4 %1 = bitcast i32* %a to i8* ; CHECK: 3 = MemoryDef(2) -; CHECK-NEXT: %a8 = call i8* @llvm.launder.invariant.group.p0i8(i8* %1) - %a8 = call i8* @llvm.launder.invariant.group.p0i8(i8* %1) +; CHECK-NEXT: %a8 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %1) + %a8 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %1) %a32 = bitcast i8* %a8 to i32* ; CHECK: MemoryUse(2) @@ -145,8 +145,8 @@ call void @clobber8(i8* %p) ; CHECK: 3 = MemoryDef(2) -; CHECK-NEXT: %after = call i8* @llvm.launder.invariant.group.p0i8(i8* %p) - %after = call i8* @llvm.launder.invariant.group.p0i8(i8* %p) +; CHECK-NEXT: %after = call i8* @llvm.invariant.group.barrier.p0i8(i8* %p) + %after = call i8* @llvm.invariant.group.barrier.p0i8(i8* %p) br i1 undef, label %Loop.Body, label %Loop.End Loop.Body: @@ -192,8 +192,8 @@ call void @clobber8(i8* %p) ; CHECK: 3 = MemoryDef(2) -; CHECK-NEXT: %after = call i8* @llvm.launder.invariant.group.p0i8(i8* %p) - %after = call i8* @llvm.launder.invariant.group.p0i8(i8* %p) +; CHECK-NEXT: %after = call i8* @llvm.invariant.group.barrier.p0i8(i8* %p) + %after = call i8* @llvm.invariant.group.barrier.p0i8(i8* %p) br i1 undef, label %Loop.Body, label %Loop.End Loop.Body: @@ -253,8 +253,8 @@ ; CHECK-NEXT: call void @clobber call void @clobber8(i8* %p) ; CHECK: 3 = MemoryDef(2) -; CHECK-NEXT: %after = call i8* @llvm.launder.invariant.group.p0i8(i8* %p) - %after = call i8* @llvm.launder.invariant.group.p0i8(i8* %p) +; CHECK-NEXT: %after = call i8* @llvm.invariant.group.barrier.p0i8(i8* %p) + %after = call i8* @llvm.invariant.group.barrier.p0i8(i8* %p) br i1 undef, label %Loop.Pre, label %Loop.End Loop.Pre: @@ -302,12 +302,12 @@ ; CHECK-NEXT: store i8 42, i8* %ptr, !invariant.group !0 store i8 42, i8* %ptr, !invariant.group !0 ; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: call i8* @llvm.launder.invariant.group - %ptr2 = call i8* @llvm.launder.invariant.group.p0i8(i8* 
%ptr) +; CHECK-NEXT: call i8* @llvm.invariant.group.barrier + %ptr2 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %ptr) ; FIXME: This one could be CSEd. ; CHECK: 3 = MemoryDef(2) -; CHECK: call i8* @llvm.launder.invariant.group - %ptr3 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr) +; CHECK: call i8* @llvm.invariant.group.barrier + %ptr3 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %ptr) ; CHECK: 4 = MemoryDef(3) ; CHECK-NEXT: call void @clobber8(i8* %ptr) call void @clobber8(i8* %ptr) @@ -331,13 +331,13 @@ ; CHECK-NEXT: store i8 42, i8* %ptr, !invariant.group !0 store i8 42, i8* %ptr, !invariant.group !0 ; CHECK: 2 = MemoryDef(1) -; CHECK-NEXT: call i8* @llvm.launder.invariant.group - %ptr2 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr) +; CHECK-NEXT: call i8* @llvm.invariant.group.barrier + %ptr2 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %ptr) ; CHECK: 3 = MemoryDef(2) store i8 43, i8* %ptr ; CHECK: 4 = MemoryDef(3) -; CHECK-NEXT: call i8* @llvm.launder.invariant.group - %ptr3 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr) +; CHECK-NEXT: call i8* @llvm.invariant.group.barrier + %ptr3 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %ptr) ; CHECK: 5 = MemoryDef(4) ; CHECK-NEXT: call void @clobber8(i8* %ptr) call void @clobber8(i8* %ptr) @@ -354,7 +354,7 @@ } -declare i8* @llvm.launder.invariant.group.p0i8(i8*) +declare i8* @llvm.invariant.group.barrier.p0i8(i8*) declare void @clobber(i32*) declare void @clobber8(i8*) declare void @use(i8* readonly) Index: test/Analysis/ScalarEvolution/exact_iter_count.ll =================================================================== --- test/Analysis/ScalarEvolution/exact_iter_count.ll +++ test/Analysis/ScalarEvolution/exact_iter_count.ll @@ -25,37 +25,3 @@ side.exit: ret void } - -define void @test_02(i1 %c) { - -; CHECK-LABEL: Determining loop execution counts for: @test_02 -; CHECK-NEXT: Loop %loop: backedge-taken count is 50 - -entry: - br label %loop - -loop: - %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ] - br i1 %c, label %if.true, label %if.false - -if.true: - br label %merge - -if.false: - br label %merge - -merge: - %side.cond = icmp slt i32 %iv, 50 - br i1 %side.cond, label %backedge, label %side.exit - -backedge: - %iv.next = add i32 %iv, 1 - %loop.cond = icmp slt i32 %iv, 100 - br i1 %loop.cond, label %loop, label %exit - -exit: - ret void - -side.exit: - ret void -} Index: test/CodeGen/AArch64/GlobalISel/unknown-intrinsic.ll =================================================================== --- test/CodeGen/AArch64/GlobalISel/unknown-intrinsic.ll +++ test/CodeGen/AArch64/GlobalISel/unknown-intrinsic.ll @@ -1,10 +1,10 @@ ; RUN: llc -O0 -mtriple=arm64 < %s -declare i8* @llvm.launder.invariant.group(i8*) +declare i8* @llvm.invariant.group.barrier(i8*) define i8* @barrier(i8* %p) { -; CHECK: bl llvm.launder.invariant.group - %q = call i8* @llvm.launder.invariant.group(i8* %p) +; CHECK: bl llvm.invariant.group.barrier + %q = call i8* @llvm.invariant.group.barrier(i8* %p) ret i8* %q } Index: test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll =================================================================== --- test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll +++ test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll @@ -126,3 +126,222 @@ %vrsqrtsh_f16 = tail call half @llvm.aarch64.neon.frsqrts.f16(half %a, half %b) ret half %vrsqrtsh_f16 } + +declare half @llvm.aarch64.neon.vcvtfxs2fp.f16.i32(i32, i32) #1 +declare half @llvm.aarch64.neon.vcvtfxs2fp.f16.i64(i64, i32) #1 +declare i32 
@llvm.aarch64.neon.vcvtfp2fxs.i32.f16(half, i32) #1 +declare i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f16(half, i32) #1 +declare half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32, i32) #1 +declare i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f16(half, i32) #1 + +define dso_local half @test_vcvth_n_f16_s16_1(i16 %a) { +; CHECK-LABEL: test_vcvth_n_f16_s16_1: +; CHECK: sxth w[[wReg:[0-9]+]], w0 +; CHECK-NEXT: fmov s0, w[[wReg:[0-9]+]] +; CHECK-NEXT: scvtf h0, s0, #1 +; CHECK-NEXT: ret +entry: + %sext = sext i16 %a to i32 + %fcvth_n = tail call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i32(i32 %sext, i32 1) + ret half %fcvth_n +} + +define dso_local half @test_vcvth_n_f16_s16_16(i16 %a) { +; CHECK-LABEL: test_vcvth_n_f16_s16_16: +; CHECK: sxth w[[wReg:[0-9]+]], w0 +; CHECK-NEXT: fmov s0, w[[wReg:[0-9]+]] +; CHECK-NEXT: scvtf h0, s0, #16 +; CHECK-NEXT: ret +entry: + %sext = sext i16 %a to i32 + %fcvth_n = tail call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i32(i32 %sext, i32 16) + ret half %fcvth_n +} + +define dso_local half @test_vcvth_n_f16_s32_1(i32 %a) { +; CHECK-LABEL: test_vcvth_n_f16_s32_1: +; CHECK: fmov s0, w0 +; CHECK-NEXT: scvtf h0, s0, #1 +; CHECK-NEXT: ret +entry: + %vcvth_n_f16_s32 = tail call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i32(i32 %a, i32 1) + ret half %vcvth_n_f16_s32 +} + +define dso_local half @test_vcvth_n_f16_s32_16(i32 %a) { +; CHECK-LABEL: test_vcvth_n_f16_s32_16: +; CHECK: fmov s0, w0 +; CHECK-NEXT: scvtf h0, s0, #16 +; CHECK-NEXT: ret +entry: + %vcvth_n_f16_s32 = tail call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i32(i32 %a, i32 16) + ret half %vcvth_n_f16_s32 +} + +define dso_local half @test_vcvth_n_f16_s64_1(i64 %a) { +; CHECK-LABEL: test_vcvth_n_f16_s64_1: +; CHECK: fmov d0, x0 +; CHECK-NEXT: fcvtzs h0, d0, #1 +; CHECK-NEXT: ret +entry: + %vcvth_n_f16_s64 = tail call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i64(i64 %a, i32 1) + ret half %vcvth_n_f16_s64 +} + +define dso_local half @test_vcvth_n_f16_s64_32(i64 %a) { +; CHECK-LABEL: test_vcvth_n_f16_s64_32: +; CHECK: fmov d0, x0 +; CHECK-NEXT: fcvtzs h0, d0, #32 +; CHECK-NEXT: ret +entry: + %vcvth_n_f16_s64 = tail call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i64(i64 %a, i32 32) + ret half %vcvth_n_f16_s64 +} + +define dso_local i16 @test_vcvth_n_s16_f16_1(half %a) { +; CHECK-LABEL: test_vcvth_n_s16_f16_1: +; CHECK: fcvtzs s0, h0, #1 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %fcvth_n = tail call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f16(half %a, i32 1) + %0 = trunc i32 %fcvth_n to i16 + ret i16 %0 +} + +define dso_local i16 @test_vcvth_n_s16_f16_16(half %a) { +; CHECK-LABEL: test_vcvth_n_s16_f16_16: +; CHECK: fcvtzs s0, h0, #16 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %fcvth_n = tail call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f16(half %a, i32 16) + %0 = trunc i32 %fcvth_n to i16 + ret i16 %0 +} + +define dso_local i32 @test_vcvth_n_s32_f16_1(half %a) { +; CHECK-LABEL: test_vcvth_n_s32_f16_1: +; CHECK: fcvtzs s0, h0, #1 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %vcvth_n_s32_f16 = tail call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f16(half %a, i32 1) + ret i32 %vcvth_n_s32_f16 +} + +define dso_local i32 @test_vcvth_n_s32_f16_16(half %a) { +; CHECK-LABEL: test_vcvth_n_s32_f16_16: +; CHECK: fcvtzs s0, h0, #16 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %vcvth_n_s32_f16 = tail call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f16(half %a, i32 16) + ret i32 %vcvth_n_s32_f16 +} + +define dso_local i64 @test_vcvth_n_s64_f16_1(half %a) { +; CHECK-LABEL: test_vcvth_n_s64_f16_1: +; CHECK: fcvtzs d0, 
h0, #1 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %vcvth_n_s64_f16 = tail call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f16(half %a, i32 1) + ret i64 %vcvth_n_s64_f16 +} + +define dso_local i64 @test_vcvth_n_s64_f16_16(half %a) { +; CHECK-LABEL: test_vcvth_n_s64_f16_16: +; CHECK: fcvtzs d0, h0, #16 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %vcvth_n_s64_f16 = tail call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f16(half %a, i32 16) + ret i64 %vcvth_n_s64_f16 +} + +define dso_local half @test_vcvth_n_f16_u16_1(i16 %a) { +; CHECK-LABEL: test_vcvth_n_f16_u16_1: +; CHECK: and w[[wReg:[0-9]+]], w0, #0xffff +; CHECK-NEXT: fmov s0, w[[wReg:[0-9]+]] +; CHECK-NEXT: ucvtf h0, s0, #1 +; CHECK-NEXT: ret +entry: + %0 = zext i16 %a to i32 + %fcvth_n = tail call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32 %0, i32 1) + ret half %fcvth_n +} + +define dso_local half @test_vcvth_n_f16_u16_16(i16 %a) { +; CHECK-LABEL: test_vcvth_n_f16_u16_16: +; CHECK: and w[[wReg:[0-9]+]], w0, #0xffff +; CHECK-NEXT: fmov s0, w[[wReg:[0-9]+]] +; CHECK-NEXT: ucvtf h0, s0, #16 +; CHECK-NEXT: ret +entry: + %0 = zext i16 %a to i32 + %fcvth_n = tail call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32 %0, i32 16) + ret half %fcvth_n +} + +define dso_local half @test_vcvth_n_f16_u32_1(i32 %a) { +; CHECK-LABEL: test_vcvth_n_f16_u32_1: +; CHECK: fmov s0, w0 +; CHECK-NEXT: ucvtf h0, s0, #1 +; CHECK-NEXT: ret +entry: + %vcvth_n_f16_u32 = tail call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32 %a, i32 1) + ret half %vcvth_n_f16_u32 +} + +define dso_local half @test_vcvth_n_f16_u32_16(i32 %a) { +; CHECK-LABEL: test_vcvth_n_f16_u32_16: +; CHECK: fmov s0, w0 +; CHECK-NEXT: ucvtf h0, s0, #16 +; CHECK-NEXT: ret +entry: + %vcvth_n_f16_u32 = tail call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32 %a, i32 16) + ret half %vcvth_n_f16_u32 +} + +define dso_local i16 @test_vcvth_n_u16_f16_1(half %a) { +; CHECK-LABEL: test_vcvth_n_u16_f16_1: +; CHECK: fcvtzu s0, h0, #1 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %fcvth_n = tail call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f16(half %a, i32 1) + %0 = trunc i32 %fcvth_n to i16 + ret i16 %0 +} + +define dso_local i16 @test_vcvth_n_u16_f16_16(half %a) { +; CHECK-LABEL: test_vcvth_n_u16_f16_16: +; CHECK: fcvtzu s0, h0, #16 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %fcvth_n = tail call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f16(half %a, i32 16) + %0 = trunc i32 %fcvth_n to i16 + ret i16 %0 +} + +define dso_local i32 @test_vcvth_n_u32_f16_1(half %a) { +; CHECK-LABEL: test_vcvth_n_u32_f16_1: +; CHECK: fcvtzu s0, h0, #1 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %vcvth_n_u32_f16 = tail call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f16(half %a, i32 1) + ret i32 %vcvth_n_u32_f16 +} + +define dso_local i32 @test_vcvth_n_u32_f16_16(half %a) { +; CHECK-LABEL: test_vcvth_n_u32_f16_16: +; CHECK: fcvtzu s0, h0, #16 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %vcvth_n_u32_f16 = tail call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f16(half %a, i32 16) + ret i32 %vcvth_n_u32_f16 +} Index: test/CodeGen/AMDGPU/combine-cond-add-sub.ll =================================================================== --- test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -1,13 +1,10 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s ; GCN-LABEL: {{^}}add1: ; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], 
v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_addc_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, 0, v{{[0-9]+}}, [[CC]] ; GCN-NOT: v_cndmask -; GFX9-LABEL: {{^}}add1: -; GFX9: v_addc_co_u32_e{{32|64}} v{{[0-9]+}}, vcc define amdgpu_kernel void @add1(i32 addrspace(1)* nocapture %arg) { bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -21,33 +18,11 @@ ret void } -; GCN-LABEL: {{^}}add1_i16: -; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_addc_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, 0, v{{[0-9]+}}, [[CC]] -; GCN-NOT: v_cndmask - -; GFX9-LABEL: {{^}}add1_i16: -; GFX9: v_addc_co_u32_e{{32|64}} v{{[0-9]+}}, vcc -define i16 @add1_i16(i32 addrspace(1)* nocapture %arg, i16 addrspace(1)* nocapture %dst) { -bb: - %x = tail call i32 @llvm.amdgcn.workitem.id.x() - %y = tail call i32 @llvm.amdgcn.workitem.id.y() - %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x - %v = load i32, i32 addrspace(1)* %gep, align 4 - %cmp = icmp ugt i32 %x, %y - %ext = zext i1 %cmp to i32 - %add = add i32 %v, %ext - %trunc = trunc i32 %add to i16 - ret i16 %trunc -} - ; GCN-LABEL: {{^}}sub1: ; GCN: v_cmp_gt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_subbrev_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc ; GCN-NOT: v_cndmask -; GFX9-LABEL: {{^}}sub1: -; GFX9: v_subbrev_co_u32_e{{32|64}} v{{[0-9]+}}, vcc define amdgpu_kernel void @sub1(i32 addrspace(1)* nocapture %arg) { bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -67,8 +42,6 @@ ; GCN-NOT: v_cndmask ; GCN-NOT: v_add -; GFX9-LABEL: {{^}}add_adde: -; GFX9: v_addc_co_u32_e{{32|64}} v{{[0-9]+}}, vcc define amdgpu_kernel void @add_adde(i32 addrspace(1)* nocapture %arg, i32 %a) { bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -89,8 +62,6 @@ ; GCN-NOT: v_cndmask ; GCN-NOT: v_add -; GFX9-LABEL: {{^}}adde_add: -; GFX9: v_addc_co_u32_e{{32|64}} v{{[0-9]+}}, vcc define amdgpu_kernel void @adde_add(i32 addrspace(1)* nocapture %arg, i32 %a) { bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -111,8 +82,6 @@ ; GCN-NOT: v_cndmask ; GCN-NOT: v_sub -; GFX9-LABEL: {{^}}sub_sube: -; GFX9: v_subb_co_u32_e{{32|64}} v{{[0-9]+}}, vcc define amdgpu_kernel void @sub_sube(i32 addrspace(1)* nocapture %arg, i32 %a) { bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -133,8 +102,6 @@ ; GCN-NOT: v_cndmask ; GCN-NOT: v_sub -; GFX9-LABEL: {{^}}sube_sub: -; GFX9: v_subb_co_u32_e{{32|64}} v{{[0-9]+}}, vcc define amdgpu_kernel void @sube_sub(i32 addrspace(1)* nocapture %arg, i32 %a) { bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -154,8 +121,6 @@ ; GCN: v_addc_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, 0, v{{[0-9]+}}, [[CC]] ; GCN-NOT: v_cndmask -; GFX9-LABEL: {{^}}zext_flclass: -; GFX9: v_addc_co_u32_e{{32|64}} v{{[0-9]+}}, vcc define amdgpu_kernel void @zext_flclass(i32 addrspace(1)* nocapture %arg, float %x) { bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -173,8 +138,6 @@ ; GCN: v_subbrev_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc ; GCN-NOT: v_cndmask -; GFX9-LABEL: {{^}}sext_flclass: -; GFX9: v_subbrev_co_u32_e32 v{{[0-9]+}}, vcc define amdgpu_kernel void @sext_flclass(i32 addrspace(1)* nocapture %arg, float %x) { bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -192,8 +155,6 @@ ; GCN: v_addc_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, 0, v{{[0-9]+}}, [[CC]] ; GCN-NOT: v_cndmask -; GFX9-LABEL: {{^}}add_and: -; GFX9: v_addc_co_u32_e{{32|64}} v{{[0-9]+}}, vcc define amdgpu_kernel void @add_and(i32 addrspace(1)* nocapture %arg) { bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() Index: 
test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll =================================================================== --- test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll +++ test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll @@ -3,7 +3,7 @@ declare {}* @llvm.invariant.start.p5i8(i64, i8 addrspace(5)* nocapture) #0 declare void @llvm.invariant.end.p5i8({}*, i64, i8 addrspace(5)* nocapture) #0 -declare i8 addrspace(5)* @llvm.launder.invariant.group.p5i8(i8 addrspace(5)*) #1 +declare i8 addrspace(5)* @llvm.invariant.group.barrier.p5i8(i8 addrspace(5)*) #1 ; GCN-LABEL: {{^}}use_invariant_promotable_lds: ; GCN: buffer_load_dword @@ -17,7 +17,7 @@ store i32 %tmp3, i32 addrspace(5)* %tmp %tmp4 = call {}* @llvm.invariant.start.p5i8(i64 4, i8 addrspace(5)* %tmp1) #0 call void @llvm.invariant.end.p5i8({}* %tmp4, i64 4, i8 addrspace(5)* %tmp1) #0 - %tmp5 = call i8 addrspace(5)* @llvm.launder.invariant.group.p5i8(i8 addrspace(5)* %tmp1) #1 + %tmp5 = call i8 addrspace(5)* @llvm.invariant.group.barrier.p5i8(i8 addrspace(5)* %tmp1) #1 ret void } Index: test/CodeGen/ARM/vuzp.ll =================================================================== --- test/CodeGen/ARM/vuzp.ll +++ test/CodeGen/ARM/vuzp.ll @@ -459,32 +459,35 @@ define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1, ; CHECK-LABEL: vuzp_wide_type: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: add r12, sp, #32 -; CHECK-NEXT: add lr, sp, #48 +; CHECK-NEXT: .save {r4, r10, r11, lr} +; CHECK-NEXT: push {r4, r10, r11, lr} +; CHECK-NEXT: .setfp r11, sp, #8 +; CHECK-NEXT: add r11, sp, #8 +; CHECK-NEXT: bic sp, sp, #15 +; CHECK-NEXT: add r12, r11, #32 +; CHECK-NEXT: add lr, r11, #44 ; CHECK-NEXT: vld1.32 {d17[0]}, [r12:32] -; CHECK-NEXT: add r12, sp, #24 +; CHECK-NEXT: add r12, r11, #24 ; CHECK-NEXT: vld1.32 {d16[0]}, [r12:32] -; CHECK-NEXT: add r12, sp, #56 +; CHECK-NEXT: add r12, r11, #52 ; CHECK-NEXT: vld1.32 {d19[0]}, [r12:32] -; CHECK-NEXT: ldr r12, [sp, #68] +; CHECK-NEXT: ldr r12, [r11, #64] ; CHECK-NEXT: vld1.32 {d18[0]}, [lr:32] -; CHECK-NEXT: add lr, sp, #40 +; CHECK-NEXT: add lr, r11, #40 ; CHECK-NEXT: vld1.32 {d20[0]}, [lr:32] ; CHECK-NEXT: ldr r4, [r12] ; CHECK-NEXT: vmov.32 d23[0], r4 -; CHECK-NEXT: add r4, sp, #64 +; CHECK-NEXT: add r4, r11, #60 ; CHECK-NEXT: vld1.32 {d24[0]}, [r4:32] -; CHECK-NEXT: add r4, sp, #36 +; CHECK-NEXT: add r4, r11, #36 ; CHECK-NEXT: vld1.32 {d17[1]}, [r4:32] -; CHECK-NEXT: add r4, sp, #28 +; CHECK-NEXT: add r4, r11, #28 ; CHECK-NEXT: vcgt.u32 q10, q12, q10 ; CHECK-NEXT: vmov.u8 lr, d23[3] ; CHECK-NEXT: vld1.32 {d16[1]}, [r4:32] -; CHECK-NEXT: add r4, sp, #60 +; CHECK-NEXT: add r4, r11, #56 ; CHECK-NEXT: vld1.32 {d19[1]}, [r4:32] -; CHECK-NEXT: add r4, sp, #52 +; CHECK-NEXT: add r4, r11, #48 ; CHECK-NEXT: vld1.32 {d18[1]}, [r4:32] ; CHECK-NEXT: add r4, r12, #4 ; CHECK-NEXT: vcgt.u32 q8, q9, q8 @@ -497,7 +500,7 @@ ; CHECK-NEXT: vneg.s8 q9, q9 ; CHECK-NEXT: vtbl.8 d16, {d22, d23}, d20 ; CHECK-NEXT: vld1.8 {d17[1]}, [r4] -; CHECK-NEXT: add r4, sp, #8 +; CHECK-NEXT: add r4, r11, #8 ; CHECK-NEXT: vshl.i8 q8, q8, #7 ; CHECK-NEXT: vld1.64 {d20, d21}, [r4] ; CHECK-NEXT: vshl.s8 q8, q8, q9 @@ -506,7 +509,8 @@ ; CHECK-NEXT: vbsl q8, q9, q10 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: pop {r4, lr} +; CHECK-NEXT: sub sp, r11, #8 +; CHECK-NEXT: pop {r4, r10, r11, lr} ; CHECK-NEXT: mov pc, lr ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: Index: test/CodeGen/Generic/intrinsics.ll 
=================================================================== --- test/CodeGen/Generic/intrinsics.ll +++ test/CodeGen/Generic/intrinsics.ll @@ -39,10 +39,10 @@ ret double %I } -declare i8* @llvm.launder.invariant.group(i8*) +declare i8* @llvm.invariant.group.barrier(i8*) define i8* @barrier(i8* %p) { - %q = call i8* @llvm.launder.invariant.group(i8* %p) + %q = call i8* @llvm.invariant.group.barrier(i8* %p) ret i8* %q } Index: test/CodeGen/X86/3dnow-schedule.ll =================================================================== --- test/CodeGen/X86/3dnow-schedule.ll +++ test/CodeGen/X86/3dnow-schedule.ll @@ -15,7 +15,7 @@ ; CHECK-LABEL: test_pavgusb: ; CHECK: # %bb.0: ; CHECK-NEXT: pavgusb %mm1, %mm0 # sched: [5:1.00] -; CHECK-NEXT: pavgusb (%rdi), %mm0 # sched: [11:1.00] +; CHECK-NEXT: pavgusb (%rdi), %mm0 # sched: [10:1.00] ; CHECK-NEXT: movq %mm0, %rax # sched: [1:0.33] ; CHECK-NEXT: retq # sched: [1:1.00] %1 = call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %a0, x86_mmx %a1) @@ -345,7 +345,7 @@ ; CHECK-LABEL: test_pmulhrw: ; CHECK: # %bb.0: ; CHECK-NEXT: pmulhrw %mm1, %mm0 # sched: [5:1.00] -; CHECK-NEXT: pmulhrw (%rdi), %mm0 # sched: [11:1.00] +; CHECK-NEXT: pmulhrw (%rdi), %mm0 # sched: [10:1.00] ; CHECK-NEXT: movq %mm0, %rax # sched: [1:0.33] ; CHECK-NEXT: retq # sched: [1:1.00] %1 = call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %a0, x86_mmx %a1) Index: test/CodeGen/X86/avx2-schedule.ll =================================================================== --- test/CodeGen/X86/avx2-schedule.ll +++ test/CodeGen/X86/avx2-schedule.ll @@ -10,7 +10,7 @@ ; GENERIC-LABEL: test_broadcasti128: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [6:1.00] -; GENERIC-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_broadcasti128: @@ -174,8 +174,8 @@ define <4 x i32> @test_extracti128(<8 x i32> %a0, <8 x i32> %a1, <4 x i32> *%a2) { ; GENERIC-LABEL: test_extracti128: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.50] -; GENERIC-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm2 # sched: [3:1.00] +; GENERIC-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: vextracti128 $1, %ymm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] @@ -523,8 +523,8 @@ ; GENERIC-LABEL: test_inserti128: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00] -; GENERIC-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; GENERIC-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_inserti128: @@ -609,7 +609,7 @@ ; GENERIC-LABEL: test_mpsadbw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [7:1.00] -; GENERIC-NEXT: vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [14:1.00] +; GENERIC-NEXT: vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [13:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_mpsadbw: @@ -652,8 +652,8 @@ define <32 x i8> @test_pabsb(<32 x i8> %a0, <32 x i8> *%a1) { ; GENERIC-LABEL: test_pabsb: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpabsb (%rdi), 
%ymm1 # sched: [8:0.50] +; GENERIC-NEXT: vpabsb %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpabsb (%rdi), %ymm1 # sched: [8:1.00] ; GENERIC-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -702,8 +702,8 @@ define <8 x i32> @test_pabsd(<8 x i32> %a0, <8 x i32> *%a1) { ; GENERIC-LABEL: test_pabsd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpabsd (%rdi), %ymm1 # sched: [8:0.50] +; GENERIC-NEXT: vpabsd %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpabsd (%rdi), %ymm1 # sched: [8:1.00] ; GENERIC-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -752,8 +752,8 @@ define <16 x i16> @test_pabsw(<16 x i16> %a0, <16 x i16> *%a1) { ; GENERIC-LABEL: test_pabsw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpabsw (%rdi), %ymm1 # sched: [8:0.50] +; GENERIC-NEXT: vpabsw %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpabsw (%rdi), %ymm1 # sched: [8:1.00] ; GENERIC-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -803,7 +803,7 @@ ; GENERIC-LABEL: test_packssdw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_packssdw: @@ -847,7 +847,7 @@ ; GENERIC-LABEL: test_packsswb: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_packsswb: @@ -891,7 +891,7 @@ ; GENERIC-LABEL: test_packusdw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_packusdw: @@ -935,7 +935,7 @@ ; GENERIC-LABEL: test_packuswb: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_packuswb: @@ -978,8 +978,8 @@ define <32 x i8> @test_paddb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; GENERIC-LABEL: test_paddb: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_paddb: @@ -1020,8 +1020,8 @@ define <8 x i32> @test_paddd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; GENERIC-LABEL: test_paddd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_paddd: @@ -1062,8 +1062,8 @@ define <4 x i64> @test_paddq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; 
GENERIC-LABEL: test_paddq: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_paddq: @@ -1104,8 +1104,8 @@ define <32 x i8> @test_paddsb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; GENERIC-LABEL: test_paddsb: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_paddsb: @@ -1147,8 +1147,8 @@ define <16 x i16> @test_paddsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; GENERIC-LABEL: test_paddsw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_paddsw: @@ -1190,8 +1190,8 @@ define <32 x i8> @test_paddusb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; GENERIC-LABEL: test_paddusb: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_paddusb: @@ -1233,8 +1233,8 @@ define <16 x i16> @test_paddusw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; GENERIC-LABEL: test_paddusw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_paddusw: @@ -1276,8 +1276,8 @@ define <16 x i16> @test_paddw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; GENERIC-LABEL: test_paddw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_paddw: @@ -1320,7 +1320,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:1.00] ; GENERIC-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] sched: [1:1.00] -; GENERIC-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_palignr: @@ -1369,7 +1369,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; GENERIC-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50] -; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: 
[1:0.50] +; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pand: @@ -1418,7 +1418,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; GENERIC-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50] -; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pandn: @@ -1467,8 +1467,8 @@ define <32 x i8> @test_pavgb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; GENERIC-LABEL: test_pavgb: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpavgb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpavgb %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpavgb (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pavgb: @@ -1519,8 +1519,8 @@ define <16 x i16> @test_pavgw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; GENERIC-LABEL: test_pavgw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpavgw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpavgw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpavgw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pavgw: @@ -1621,8 +1621,8 @@ ; GENERIC-LABEL: test_pblendd_ymm: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.50] -; GENERIC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [8:0.50] -; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [7:0.50] +; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pblendd_ymm: @@ -1670,7 +1670,7 @@ ; GENERIC-LABEL: test_pblendvb: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; GENERIC-NEXT: vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [9:1.00] +; GENERIC-NEXT: vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pblendvb: @@ -1713,8 +1713,8 @@ ; GENERIC-LABEL: test_pblendw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:0.50] -; GENERIC-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] sched: [8:0.50] -; GENERIC-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] sched: [7:0.50] +; GENERIC-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pblendw: @@ -1812,7 +1812,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpbroadcastb %xmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vpbroadcastb (%rdi), %ymm1 # sched: [6:1.00] -; GENERIC-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pbroadcastb_ymm: @@ -1910,7 +1910,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpbroadcastd 
%xmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vpbroadcastd (%rdi), %ymm1 # sched: [6:1.00] -; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pbroadcastd_ymm: @@ -2008,7 +2008,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpbroadcastq %xmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vpbroadcastq (%rdi), %ymm1 # sched: [6:1.00] -; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pbroadcastq_ymm: @@ -2106,7 +2106,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpbroadcastw %xmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vpbroadcastw (%rdi), %ymm1 # sched: [6:1.00] -; GENERIC-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pbroadcastw_ymm: @@ -2153,8 +2153,8 @@ define <32 x i8> @test_pcmpeqb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; GENERIC-LABEL: test_pcmpeqb: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pcmpeqb: @@ -2197,8 +2197,8 @@ define <8 x i32> @test_pcmpeqd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; GENERIC-LABEL: test_pcmpeqd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pcmpeqd: @@ -2241,8 +2241,8 @@ define <4 x i64> @test_pcmpeqq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; GENERIC-LABEL: test_pcmpeqq: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pcmpeqq: @@ -2285,8 +2285,8 @@ define <16 x i16> @test_pcmpeqw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; GENERIC-LABEL: test_pcmpeqw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pcmpeqw: @@ -2329,8 +2329,8 @@ define <32 x i8> @test_pcmpgtb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; GENERIC-LABEL: test_pcmpgtb: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pcmpgtb: @@ -2373,8 +2373,8 @@ define <8 x i32> @test_pcmpgtd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; 
GENERIC-LABEL: test_pcmpgtd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pcmpgtd: @@ -2417,8 +2417,8 @@ define <4 x i64> @test_pcmpgtq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; GENERIC-LABEL: test_pcmpgtq: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pcmpgtq: @@ -2461,8 +2461,8 @@ define <16 x i16> @test_pcmpgtw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; GENERIC-LABEL: test_pcmpgtw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pcmpgtw: @@ -2506,8 +2506,8 @@ ; GENERIC-LABEL: test_perm2i128: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] -; GENERIC-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [6:1.00] +; GENERIC-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_perm2i128: @@ -2555,8 +2555,8 @@ ; GENERIC-LABEL: test_permd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpermd %ymm1, %ymm0, %ymm1 # sched: [1:1.00] -; GENERIC-NEXT: vpermd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; GENERIC-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpermd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_permd: @@ -2605,7 +2605,7 @@ ; GENERIC-LABEL: test_permpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [1:1.00] -; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [6:1.00] ; GENERIC-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2654,7 +2654,7 @@ ; GENERIC-LABEL: test_permps: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpermps %ymm1, %ymm0, %ymm1 # sched: [1:1.00] -; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2704,8 +2704,8 @@ ; GENERIC-LABEL: test_permq: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [1:1.00] -; GENERIC-NEXT: vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [8:1.00] -; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [6:1.00] +; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_permq: @@ -3039,7 +3039,7 @@ ; 
GENERIC-LABEL: test_phaddd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vphaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.50] -; GENERIC-NEXT: vphaddd (%rdi), %ymm0, %ymm0 # sched: [10:1.50] +; GENERIC-NEXT: vphaddd (%rdi), %ymm0, %ymm0 # sched: [9:1.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_phaddd: @@ -3082,7 +3082,7 @@ ; GENERIC-LABEL: test_phaddsw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 # sched: [3:1.50] -; GENERIC-NEXT: vphaddsw (%rdi), %ymm0, %ymm0 # sched: [10:1.50] +; GENERIC-NEXT: vphaddsw (%rdi), %ymm0, %ymm0 # sched: [9:1.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_phaddsw: @@ -3125,7 +3125,7 @@ ; GENERIC-LABEL: test_phaddw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vphaddw %ymm1, %ymm0, %ymm0 # sched: [3:1.50] -; GENERIC-NEXT: vphaddw (%rdi), %ymm0, %ymm0 # sched: [10:1.50] +; GENERIC-NEXT: vphaddw (%rdi), %ymm0, %ymm0 # sched: [9:1.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_phaddw: @@ -3168,7 +3168,7 @@ ; GENERIC-LABEL: test_phsubd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vphsubd %ymm1, %ymm0, %ymm0 # sched: [3:1.50] -; GENERIC-NEXT: vphsubd (%rdi), %ymm0, %ymm0 # sched: [10:1.50] +; GENERIC-NEXT: vphsubd (%rdi), %ymm0, %ymm0 # sched: [9:1.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_phsubd: @@ -3211,7 +3211,7 @@ ; GENERIC-LABEL: test_phsubsw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 # sched: [3:1.50] -; GENERIC-NEXT: vphsubsw (%rdi), %ymm0, %ymm0 # sched: [10:1.50] +; GENERIC-NEXT: vphsubsw (%rdi), %ymm0, %ymm0 # sched: [9:1.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_phsubsw: @@ -3254,7 +3254,7 @@ ; GENERIC-LABEL: test_phsubw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vphsubw %ymm1, %ymm0, %ymm0 # sched: [3:1.50] -; GENERIC-NEXT: vphsubw (%rdi), %ymm0, %ymm0 # sched: [10:1.50] +; GENERIC-NEXT: vphsubw (%rdi), %ymm0, %ymm0 # sched: [9:1.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_phsubw: @@ -3297,7 +3297,7 @@ ; GENERIC-LABEL: test_pmaddubsw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; GENERIC-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmaddubsw: @@ -3341,7 +3341,7 @@ ; GENERIC-LABEL: test_pmaddwd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; GENERIC-NEXT: vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmaddwd: @@ -3580,8 +3580,8 @@ define <32 x i8> @test_pmaxsb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; GENERIC-LABEL: test_pmaxsb: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmaxsb: @@ -3623,8 +3623,8 @@ define <8 x i32> @test_pmaxsd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; GENERIC-LABEL: test_pmaxsd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpmaxsd (%rdi), %ymm0, %ymm0 # sched: 
[8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmaxsd: @@ -3666,8 +3666,8 @@ define <16 x i16> @test_pmaxsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; GENERIC-LABEL: test_pmaxsw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmaxsw: @@ -3709,8 +3709,8 @@ define <32 x i8> @test_pmaxub(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; GENERIC-LABEL: test_pmaxub: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpmaxub (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpmaxub (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmaxub: @@ -3752,8 +3752,8 @@ define <8 x i32> @test_pmaxud(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; GENERIC-LABEL: test_pmaxud: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpmaxud (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpmaxud (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmaxud: @@ -3795,8 +3795,8 @@ define <16 x i16> @test_pmaxuw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; GENERIC-LABEL: test_pmaxuw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmaxuw: @@ -3838,8 +3838,8 @@ define <32 x i8> @test_pminsb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; GENERIC-LABEL: test_pminsb: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpminsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpminsb %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpminsb (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pminsb: @@ -3881,8 +3881,8 @@ define <8 x i32> @test_pminsd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; GENERIC-LABEL: test_pminsd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpminsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpminsd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpminsd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pminsd: @@ -3924,8 +3924,8 @@ define <16 x i16> @test_pminsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; GENERIC-LABEL: test_pminsw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpminsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpminsw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpminsw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pminsw: @@ -3967,8 +3967,8 @@ define <32 x i8> @test_pminub(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; GENERIC-LABEL: test_pminub: ; 
GENERIC: # %bb.0: -; GENERIC-NEXT: vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpminub (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpminub %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpminub (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pminub: @@ -4010,8 +4010,8 @@ define <8 x i32> @test_pminud(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; GENERIC-LABEL: test_pminud: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpminud (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpminud %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpminud (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pminud: @@ -4053,8 +4053,8 @@ define <16 x i16> @test_pminuw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; GENERIC-LABEL: test_pminuw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpminuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpminuw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpminuw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pminuw: @@ -4139,7 +4139,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxbd %xmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovsxbd (%rdi), %ymm1 # sched: [6:1.00] -; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmovsxbd: @@ -4190,7 +4190,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxbq %xmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovsxbq (%rdi), %ymm1 # sched: [6:1.00] -; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmovsxbq: @@ -4241,7 +4241,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxbw %xmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovsxbw (%rdi), %ymm1 # sched: [6:1.00] -; GENERIC-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmovsxbw: @@ -4290,7 +4290,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxdq %xmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovsxdq (%rdi), %ymm1 # sched: [6:1.00] -; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmovsxdq: @@ -4339,7 +4339,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovsxwd (%rdi), %ymm1 # sched: [6:1.00] -; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmovsxwd: @@ -4388,7 +4388,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxwq %xmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovsxwq (%rdi), %ymm1 # sched: [6:1.00] -; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmovsxwq: @@ -4439,7 +4439,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxbd {{.*#+}} ymm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [1:1.00] ; GENERIC-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [6:1.00] -; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmovzxbd: @@ -4490,7 +4490,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00] ; GENERIC-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [6:1.00] -; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmovzxbq: @@ -4541,7 +4541,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00] ; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [6:1.00] -; GENERIC-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmovzxbw: @@ -4590,7 +4590,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00] ; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [6:1.00] -; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmovzxdq: @@ -4639,7 +4639,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00] ; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [6:1.00] -; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmovzxwd: @@ -4688,7 +4688,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00] ; GENERIC-NEXT: vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [6:1.00] -; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # 
sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmovzxwq: @@ -4738,7 +4738,7 @@ ; GENERIC-LABEL: test_pmuldq: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; GENERIC-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmuldq: @@ -4782,7 +4782,7 @@ ; GENERIC-LABEL: test_pmulhrsw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; GENERIC-NEXT: vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmulhrsw: @@ -4825,7 +4825,7 @@ ; GENERIC-LABEL: test_pmulhuw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; GENERIC-NEXT: vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmulhuw: @@ -4868,7 +4868,7 @@ ; GENERIC-LABEL: test_pmulhw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmulhw (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; GENERIC-NEXT: vpmulhw (%rdi), %ymm0, %ymm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmulhw: @@ -4911,7 +4911,7 @@ ; GENERIC-LABEL: test_pmulld: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; GENERIC-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [11:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmulld: @@ -4953,7 +4953,7 @@ ; GENERIC-LABEL: test_pmullw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; GENERIC-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmullw: @@ -4995,7 +4995,7 @@ ; GENERIC-LABEL: test_pmuludq: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; GENERIC-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmuludq: @@ -5040,7 +5040,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; GENERIC-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50] -; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_por: @@ -5088,7 +5088,7 @@ ; GENERIC-LABEL: test_psadbw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; GENERIC-NEXT: vpsadbw (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; GENERIC-NEXT: vpsadbw (%rdi), %ymm0, %ymm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psadbw: @@ -5132,7 +5132,7 @@ ; GENERIC-LABEL: test_pshufb: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpshufb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpshufb (%rdi), %ymm0, %ymm0 # sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pshufb: @@ -5175,8 +5175,8 @@ ; GENERIC-LABEL: test_pshufd: ; GENERIC: # %bb.0: ; 
GENERIC-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00] -; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [8:1.00] -; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [6:1.00] +; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pshufd: @@ -5224,7 +5224,7 @@ ; GENERIC-LABEL: test_pshufhw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00] -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [6:1.00] ; GENERIC-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5273,7 +5273,7 @@ ; GENERIC-LABEL: test_pshuflw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00] -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5321,8 +5321,8 @@ define <32 x i8> @test_psignb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; GENERIC-LABEL: test_psignb: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpsignb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpsignb %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpsignb (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psignb: @@ -5364,8 +5364,8 @@ define <8 x i32> @test_psignd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; GENERIC-LABEL: test_psignd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpsignd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpsignd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpsignd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psignd: @@ -5407,8 +5407,8 @@ define <16 x i16> @test_psignw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; GENERIC-LABEL: test_psignw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpsignw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpsignw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpsignw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psignw: @@ -5450,8 +5450,8 @@ define <8 x i32> @test_pslld(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; GENERIC-LABEL: test_pslld: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpslld %xmm1, %ymm0, %ymm0 # sched: [4:1.00] -; GENERIC-NEXT: vpslld (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; GENERIC-NEXT: vpslld %xmm1, %ymm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: vpslld (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: vpslld $2, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5534,8 +5534,8 @@ define <4 x i64> @test_psllq(<4 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; GENERIC-LABEL: test_psllq: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsllq %xmm1, %ymm0, %ymm0 # sched: [4:1.00] -; GENERIC-NEXT: vpsllq (%rdi), %ymm0, 
%ymm0 # sched: [11:1.00] +; GENERIC-NEXT: vpsllq %xmm1, %ymm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: vpsllq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: vpsllq $2, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5585,7 +5585,7 @@ ; GENERIC-LABEL: test_psllvd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsllvd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpsllvd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psllvd: @@ -5628,7 +5628,7 @@ ; GENERIC-LABEL: test_psllvd_ymm: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsllvd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpsllvd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psllvd_ymm: @@ -5671,7 +5671,7 @@ ; GENERIC-LABEL: test_psllvq: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsllvq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpsllvq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psllvq: @@ -5714,7 +5714,7 @@ ; GENERIC-LABEL: test_psllvq_ymm: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsllvq (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpsllvq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psllvq_ymm: @@ -5756,8 +5756,8 @@ define <16 x i16> @test_psllw(<16 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_psllw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsllw %xmm1, %ymm0, %ymm0 # sched: [4:1.00] -; GENERIC-NEXT: vpsllw (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; GENERIC-NEXT: vpsllw %xmm1, %ymm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: vpsllw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: vpsllw $2, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5806,8 +5806,8 @@ define <8 x i32> @test_psrad(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; GENERIC-LABEL: test_psrad: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsrad %xmm1, %ymm0, %ymm0 # sched: [4:1.00] -; GENERIC-NEXT: vpsrad (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; GENERIC-NEXT: vpsrad %xmm1, %ymm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: vpsrad (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: vpsrad $2, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5857,7 +5857,7 @@ ; GENERIC-LABEL: test_psravd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsravd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsravd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpsravd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psravd: @@ -5900,7 +5900,7 @@ ; GENERIC-LABEL: test_psravd_ymm: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsravd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsravd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpsravd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psravd_ymm: @@ -5942,8 +5942,8 @@ define <16 x i16> @test_psraw(<16 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_psraw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsraw %xmm1, %ymm0, %ymm0 # sched: [4:1.00] -; GENERIC-NEXT: vpsraw (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; 
GENERIC-NEXT: vpsraw %xmm1, %ymm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: vpsraw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: vpsraw $2, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5992,8 +5992,8 @@ define <8 x i32> @test_psrld(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; GENERIC-LABEL: test_psrld: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsrld %xmm1, %ymm0, %ymm0 # sched: [4:1.00] -; GENERIC-NEXT: vpsrld (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; GENERIC-NEXT: vpsrld %xmm1, %ymm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: vpsrld (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: vpsrld $2, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6076,8 +6076,8 @@ define <4 x i64> @test_psrlq(<4 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; GENERIC-LABEL: test_psrlq: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 # sched: [4:1.00] -; GENERIC-NEXT: vpsrlq (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; GENERIC-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: vpsrlq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: vpsrlq $2, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6127,7 +6127,7 @@ ; GENERIC-LABEL: test_psrlvd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psrlvd: @@ -6170,7 +6170,7 @@ ; GENERIC-LABEL: test_psrlvd_ymm: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psrlvd_ymm: @@ -6213,7 +6213,7 @@ ; GENERIC-LABEL: test_psrlvq: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psrlvq: @@ -6256,7 +6256,7 @@ ; GENERIC-LABEL: test_psrlvq_ymm: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psrlvq_ymm: @@ -6298,8 +6298,8 @@ define <16 x i16> @test_psrlw(<16 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; GENERIC-LABEL: test_psrlw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 # sched: [4:1.00] -; GENERIC-NEXT: vpsrlw (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; GENERIC-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: vpsrlw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: vpsrlw $2, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6348,8 +6348,8 @@ define <32 x i8> @test_psubb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; GENERIC-LABEL: test_psubb: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psubb: @@ -6390,8 +6390,8 @@ 
define <8 x i32> @test_psubd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; GENERIC-LABEL: test_psubd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psubd: @@ -6432,8 +6432,8 @@ define <4 x i64> @test_psubq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; GENERIC-LABEL: test_psubq: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psubq: @@ -6474,8 +6474,8 @@ define <32 x i8> @test_psubsb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; GENERIC-LABEL: test_psubsb: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psubsb: @@ -6517,8 +6517,8 @@ define <16 x i16> @test_psubsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; GENERIC-LABEL: test_psubsw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psubsw: @@ -6560,8 +6560,8 @@ define <32 x i8> @test_psubusb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; GENERIC-LABEL: test_psubusb: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psubusb: @@ -6603,8 +6603,8 @@ define <16 x i16> @test_psubusw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; GENERIC-LABEL: test_psubusw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psubusw: @@ -6646,8 +6646,8 @@ define <16 x i16> @test_psubw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; GENERIC-LABEL: test_psubw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; GENERIC-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psubw: @@ -6689,7 +6689,7 @@ ; GENERIC-LABEL: test_punpckhbw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpunpckhbw {{.*#+}} ymm0 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00] -; GENERIC-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [8:1.00] +; GENERIC-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_punpckhbw: @@ -6731,9 +6731,9 @@ ; GENERIC-LABEL: test_punpckhdq: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] -; GENERIC-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50] -; GENERIC-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [6:1.00] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [3:1.00] +; GENERIC-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_punpckhdq: @@ -6786,8 +6786,8 @@ ; GENERIC-LABEL: test_punpckhqdq: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] -; GENERIC-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] -; GENERIC-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [6:1.00] +; GENERIC-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_punpckhqdq: @@ -6835,7 +6835,7 @@ ; GENERIC-LABEL: test_punpckhwd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00] -; GENERIC-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [8:1.00] +; GENERIC-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_punpckhwd: @@ -6877,7 +6877,7 @@ ; GENERIC-LABEL: test_punpcklbw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00] -; GENERIC-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [8:1.00] +; GENERIC-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_punpcklbw: @@ -6919,9 +6919,9 @@ ; GENERIC-LABEL: test_punpckldq: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] -; GENERIC-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50] -; GENERIC-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [6:1.00] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [3:1.00] +; GENERIC-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_punpckldq: @@ -6974,8 +6974,8 @@ ; GENERIC-LABEL: test_punpcklqdq: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] -; GENERIC-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] -; GENERIC-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [6:1.00] +; GENERIC-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_punpcklqdq: @@ -7023,7 +7023,7 @@ ; GENERIC-LABEL: test_punpcklwd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00] -; GENERIC-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [8:1.00] +; GENERIC-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_punpcklwd: @@ -7066,7 +7066,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; GENERIC-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50] -; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pxor: Index: test/CodeGen/X86/avx512-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics.ll +++ test/CodeGen/X86/avx512-intrinsics.ll @@ -3404,8 +3404,8 @@ define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vprorvd %zmm1, %zmm0, 
%zmm3 ; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 @@ -3424,8 +3424,8 @@ define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm3 ; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 Index: test/CodeGen/X86/avx512-schedule.ll =================================================================== --- test/CodeGen/X86/avx512-schedule.ll +++ test/CodeGen/X86/avx512-schedule.ll @@ -291,7 +291,7 @@ define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone { ; GENERIC-LABEL: vpaddq_test: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddq %zmm1, %zmm0, %zmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddq %zmm1, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddq_test: @@ -305,7 +305,7 @@ define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, <8 x i64>* %j) nounwind { ; GENERIC-LABEL: vpaddq_fold_test: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddq (%rdi), %zmm0, %zmm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddq (%rdi), %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddq_fold_test: @@ -320,7 +320,7 @@ define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind { ; GENERIC-LABEL: vpaddq_broadcast_test: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddq_broadcast_test: @@ -334,7 +334,7 @@ define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind { ; GENERIC-LABEL: vpaddq_broadcast2_test: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddq_broadcast2_test: @@ -357,7 +357,7 @@ define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone { ; GENERIC-LABEL: vpaddd_test: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_test: @@ -371,7 +371,7 @@ define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, <16 x i32>* %j) nounwind { ; GENERIC-LABEL: vpaddd_fold_test: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_fold_test: @@ -386,7 +386,7 @@ define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind { ; GENERIC-LABEL: vpaddd_broadcast_test: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_broadcast_test: @@ -401,7 +401,7 @@ ; GENERIC-LABEL: vpaddd_mask_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmd %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} # sched: [1:0.50] +; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} # 
sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_mask_test: @@ -419,7 +419,7 @@ ; GENERIC-LABEL: vpaddd_maskz_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmd %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.50] +; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_maskz_test: @@ -437,7 +437,7 @@ ; GENERIC-LABEL: vpaddd_mask_fold_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [8:0.50] +; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_mask_fold_test: @@ -456,7 +456,7 @@ ; GENERIC-LABEL: vpaddd_mask_broadcast_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [8:0.50] +; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_mask_broadcast_test: @@ -474,7 +474,7 @@ ; GENERIC-LABEL: vpaddd_maskz_fold_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [8:0.50] +; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_maskz_fold_test: @@ -493,7 +493,7 @@ ; GENERIC-LABEL: vpaddd_maskz_broadcast_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [8:0.50] +; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_maskz_broadcast_test: @@ -510,7 +510,7 @@ define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone { ; GENERIC-LABEL: vpsubq_test: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsubq %zmm1, %zmm0, %zmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpsubq %zmm1, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpsubq_test: @@ -524,7 +524,7 @@ define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone { ; GENERIC-LABEL: vpsubd_test: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpsubd %zmm1, %zmm0, %zmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpsubd %zmm1, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpsubd_test: @@ -643,7 +643,7 @@ define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind { ; GENERIC-LABEL: addq_broadcast: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: addq_broadcast: @@ -2828,7 +2828,7 @@ ; GENERIC-LABEL: ubto8f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0 # sched: [8:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2847,7 +2847,7 @@ ; GENERIC-LABEL: ubto8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] 
+; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: vpsrld $31, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2957,7 +2957,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x8mem_to_8x16: @@ -2977,7 +2977,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x8mem_to_8x16: @@ -2998,7 +2998,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_16x8mem_to_16x16: @@ -3018,7 +3018,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_16x8mem_to_16x16: @@ -3104,7 +3104,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_32x8mem_to_32x16: @@ -3124,7 +3124,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: 
[1:1.00] ; GENERIC-NEXT: vpmovb2m %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_32x8mem_to_32x16: @@ -3210,7 +3210,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_4x8mem_to_4x32: @@ -3230,7 +3230,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_4x8mem_to_4x32: @@ -3250,7 +3250,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x8mem_to_8x32: @@ -3270,7 +3270,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x8mem_to_8x32: @@ -3290,7 +3290,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_16x8mem_to_16x32: @@ -3310,7 +3310,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxbd (%rdi), 
%zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxbd (%rdi), %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_16x8mem_to_16x32: @@ -3396,7 +3396,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_2x8mem_to_2x64: @@ -3415,7 +3415,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_2x8mem_to_2x64mask: @@ -3449,7 +3449,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_4x8mem_to_4x64: @@ -3469,7 +3469,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_4x8mem_to_4x64mask: @@ -3504,7 +3504,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x8mem_to_8x64: @@ -3524,7 +3524,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
sext_8x8mem_to_8x64mask: @@ -3542,7 +3542,7 @@ define <8 x i64> @sext_8x8mem_to_8x64(<8 x i8> *%i) nounwind readnone { ; GENERIC-LABEL: sext_8x8mem_to_8x64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x8mem_to_8x64: @@ -3559,7 +3559,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_4x16mem_to_4x32: @@ -3579,7 +3579,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_4x16mem_to_4x32mask: @@ -3615,7 +3615,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x16mem_to_8x32: @@ -3635,7 +3635,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x16mem_to_8x32mask: @@ -3703,7 +3703,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_16x16mem_to_16x32: @@ -3723,7 +3723,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_16x16mem_to_16x32mask: @@ -3741,7 +3741,7 @@ define <16 x i32> @sext_16x16mem_to_16x32(<16 x i16> *%i) nounwind readnone { ; GENERIC-LABEL: sext_16x16mem_to_16x32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovsxwd (%rdi), %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxwd (%rdi), %zmm0 # sched: 
[6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_16x16mem_to_16x32: @@ -3790,7 +3790,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_2x16mem_to_2x64: @@ -3810,7 +3810,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_2x16mem_to_2x64mask: @@ -3845,7 +3845,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_4x16mem_to_4x64: @@ -3865,7 +3865,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_4x16mem_to_4x64mask: @@ -3900,7 +3900,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x16mem_to_8x64: @@ -3920,7 +3920,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x16mem_to_8x64mask: @@ -3938,7 +3938,7 @@ define <8 x i64> @sext_8x16mem_to_8x64(<8 x i16> *%i) nounwind readnone { ; GENERIC-LABEL: sext_8x16mem_to_8x64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovsxwq (%rdi), %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxwq (%rdi), %zmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x16mem_to_8x64: @@ -3988,7 +3988,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxdq 
{{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_2x32mem_to_2x64: @@ -4008,7 +4008,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovq2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_2x32mem_to_2x64mask: @@ -4043,7 +4043,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_4x32mem_to_4x64: @@ -4063,7 +4063,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovd2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_4x32mem_to_4x64mask: @@ -4131,7 +4131,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:1.00] +; GENERIC-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x32mem_to_8x64: @@ -4151,7 +4151,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x32mem_to_8x64mask: @@ -4169,7 +4169,7 @@ define <8 x i64> @sext_8x32mem_to_8x64(<8 x i32> *%i) nounwind readnone { ; GENERIC-LABEL: sext_8x32mem_to_8x64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovsxdq (%rdi), %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxdq (%rdi), %zmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x32mem_to_8x64: @@ -4369,8 +4369,8 @@ define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind { ; GENERIC-LABEL: sext_8i1_8i32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] -; GENERIC-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8i1_8i32: @@ -4420,7 +4420,7 @@ define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind { ; GENERIC-LABEL: sext_8i1_8i16: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [1:0.50] +; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2w %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4439,7 +4439,7 @@ 
define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind { ; GENERIC-LABEL: sext_16i1_16i32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 # sched: [1:0.50] +; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4456,7 +4456,7 @@ define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind { ; GENERIC-LABEL: sext_8i1_8i64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [1:0.50] +; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2q %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4473,7 +4473,7 @@ define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) { ; GENERIC-LABEL: extload_v8i64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [6:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm0, (%rsi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4640,7 +4640,7 @@ define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 { ; GENERIC-LABEL: zext_64xi1_to_64xi8: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 # sched: [1:0.50] +; GENERIC-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4657,7 +4657,7 @@ define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 { ; GENERIC-LABEL: zext_32xi1_to_32xi16: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 # sched: [1:0.50] +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2w %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: vpsrlw $15, %zmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4676,7 +4676,7 @@ define <16 x i16> @zext_16xi1_to_16xi16(<16 x i16> %x, <16 x i16> %y) #0 { ; GENERIC-LABEL: zext_16xi1_to_16xi16: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: vpsrlw $15, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4694,7 +4694,7 @@ define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 { ; GENERIC-LABEL: zext_32xi1_to_32xi8: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [1:0.50] +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5028,7 +5028,7 @@ define <16 x i32> @vpandd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp { ; GENERIC-LABEL: vpandd: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5048,7 +5048,7 @@ define <16 x i32> @vpandnd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp { ; GENERIC-LABEL: vpandnd: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ 
-5070,7 +5070,7 @@ define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp { ; GENERIC-LABEL: vpord: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5090,7 +5090,7 @@ define <16 x i32> @vpxord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp { ; GENERIC-LABEL: vpxord: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5110,7 +5110,7 @@ define <8 x i64> @vpandq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp { ; GENERIC-LABEL: vpandq: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5129,7 +5129,7 @@ define <8 x i64> @vpandnq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp { ; GENERIC-LABEL: vpandnq: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5149,7 +5149,7 @@ define <8 x i64> @vporq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp { ; GENERIC-LABEL: vporq: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5168,7 +5168,7 @@ define <8 x i64> @vpxorq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp { ; GENERIC-LABEL: vpxorq: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6867,7 +6867,7 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) { ; GENERIC-LABEL: zext_test1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [1:0.50] +; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: kshiftrw $5, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: andl $1, %eax # sched: [1:0.33] @@ -6891,7 +6891,7 @@ define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) { ; GENERIC-LABEL: zext_test2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [1:0.50] +; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: kshiftrw $5, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: andl $1, %eax # sched: [1:0.33] @@ -6917,7 +6917,7 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) { ; GENERIC-LABEL: zext_test3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [1:0.50] +; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] ; 
GENERIC-NEXT: kshiftrw $5, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: andb $1, %al # sched: [1:0.33] @@ -6967,8 +6967,8 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) { ; GENERIC-LABEL: test4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpleq %ymm1, %ymm0, %k1 # sched: [1:0.50] -; GENERIC-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1} # sched: [1:0.50] +; GENERIC-NEXT: vpcmpleq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -6990,8 +6990,8 @@ define <2 x i64> @vcmp_test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) { ; GENERIC-LABEL: vcmp_test5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpcmpleq %xmm3, %xmm2, %k1 # sched: [1:0.50] -; GENERIC-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 {%k1} # sched: [1:0.50] +; GENERIC-NEXT: vpcmpleq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; GENERIC-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7059,7 +7059,7 @@ ; GENERIC-NEXT: retq # sched: [1:1.00] ; GENERIC-NEXT: .LBB386_1: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 # sched: [1:0.50] +; GENERIC-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7589,7 +7589,7 @@ define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) { ; GENERIC-LABEL: test_build_vec_v64i1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_build_vec_v64i1: @@ -8258,7 +8258,7 @@ define <16 x float> @_ss16xfloat_load(float* %a.ptr) { ; GENERIC-LABEL: _ss16xfloat_load: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _ss16xfloat_load: @@ -8275,7 +8275,7 @@ ; GENERIC-LABEL: _ss16xfloat_mask_load: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _ss16xfloat_mask_load: @@ -8295,7 +8295,7 @@ ; GENERIC-LABEL: _ss16xfloat_maskz_load: ; GENERIC: # %bb.0: ; GENERIC-NEXT: 
vptestmd %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _ss16xfloat_maskz_load: @@ -8369,7 +8369,7 @@ define <8 x double> @_sd8xdouble_load(double* %a.ptr) { ; GENERIC-LABEL: _sd8xdouble_load: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _sd8xdouble_load: @@ -8386,7 +8386,7 @@ ; GENERIC-LABEL: _sd8xdouble_mask_load: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _sd8xdouble_mask_load: @@ -8406,7 +8406,7 @@ ; GENERIC-LABEL: _sd8xdouble_maskz_load: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmd %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _sd8xdouble_maskz_load: @@ -8700,7 +8700,7 @@ ; GENERIC-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill sched: [1:1.00] ; GENERIC-NEXT: callq func_f32 -; GENERIC-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:1.00] +; GENERIC-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload sched: [6:1.00] ; GENERIC-NEXT: addq $24, %rsp # sched: [1:0.33] ; GENERIC-NEXT: .cfi_def_cfa_offset 8 ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -8732,7 +8732,7 @@ ; GENERIC-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill sched: [1:1.00] ; GENERIC-NEXT: callq func_f64 -; GENERIC-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:1.00] +; GENERIC-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload sched: [6:1.00] ; GENERIC-NEXT: addq $24, %rsp # sched: [1:0.33] ; GENERIC-NEXT: .cfi_def_cfa_offset 8 ; GENERIC-NEXT: retq # sched: [1:1.00] Index: test/CodeGen/X86/avx512-shuffle-schedule.ll =================================================================== --- test/CodeGen/X86/avx512-shuffle-schedule.ll +++ test/CodeGen/X86/avx512-shuffle-schedule.ll @@ -202,7 +202,7 @@ ; GENERIC-LABEL: test_16xi16_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] -; GENERIC-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi16_perm_mem_mask0: @@ -219,7 +219,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_mem_mask0: @@ -240,7 +240,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermw (%rdi), %ymm1, 
%ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask0: @@ -261,7 +261,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_mem_mask1: @@ -282,7 +282,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask1: @@ -303,7 +303,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_mem_mask2: @@ -324,7 +324,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask2: @@ -344,7 +344,7 @@ ; GENERIC-LABEL: test_16xi16_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] -; GENERIC-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi16_perm_mem_mask3: @@ -361,7 +361,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_mem_mask3: @@ -382,7 +382,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask3: @@ -596,7 +596,7 @@ ; GENERIC-LABEL: test_32xi16_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [6:0.50] -; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: 
[1:1.00] ; ; SKX-LABEL: test_32xi16_perm_mem_mask0: @@ -613,7 +613,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [6:0.50] ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_mem_mask0: @@ -634,7 +634,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [6:0.50] ; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask0: @@ -655,7 +655,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [6:0.50] ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_mem_mask1: @@ -676,7 +676,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [6:0.50] ; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask1: @@ -697,7 +697,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [6:0.50] ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_mem_mask2: @@ -718,7 +718,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [6:0.50] ; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask2: @@ -738,7 +738,7 @@ ; GENERIC-LABEL: test_32xi16_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [6:0.50] -; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi16_perm_mem_mask3: @@ -755,7 +755,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: 
[6:0.50] ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_mem_mask3: @@ -776,7 +776,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [6:0.50] ; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask3: @@ -990,7 +990,7 @@ ; GENERIC-LABEL: test_8xi32_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50] -; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_perm_mem_mask0: @@ -1007,7 +1007,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi32_perm_mem_mask0: @@ -1028,7 +1028,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask0: @@ -1049,7 +1049,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi32_perm_mem_mask1: @@ -1070,7 +1070,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask1: @@ -1091,7 +1091,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi32_perm_mem_mask2: @@ -1112,7 +1112,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] 
; ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask2: @@ -1132,7 +1132,7 @@ ; GENERIC-LABEL: test_8xi32_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50] -; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_perm_mem_mask3: @@ -1149,7 +1149,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi32_perm_mem_mask3: @@ -1170,7 +1170,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask3: @@ -1384,7 +1384,7 @@ ; GENERIC-LABEL: test_16xi32_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [6:0.50] -; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_perm_mem_mask0: @@ -1401,7 +1401,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [6:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi32_perm_mem_mask0: @@ -1422,7 +1422,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [6:0.50] ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask0: @@ -1443,7 +1443,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [6:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi32_perm_mem_mask1: @@ -1464,7 +1464,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [6:0.50] ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask1: @@ -1485,7 +1485,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [6:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: 
[1:0.33] -; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi32_perm_mem_mask2: @@ -1506,7 +1506,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [6:0.50] ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask2: @@ -1526,7 +1526,7 @@ ; GENERIC-LABEL: test_16xi32_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [6:0.50] -; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_perm_mem_mask3: @@ -1543,7 +1543,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [6:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi32_perm_mem_mask3: @@ -1564,7 +1564,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [6:0.50] ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask3: @@ -1757,7 +1757,7 @@ define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) { ; GENERIC-LABEL: test_4xi64_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_perm_mem_mask0: @@ -1772,7 +1772,7 @@ ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [8:1.00] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi64_perm_mem_mask0: @@ -1791,7 +1791,7 @@ ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [8:1.00] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask0: @@ -1810,7 +1810,7 @@ ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [8:1.00] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi64_perm_mem_mask1: @@ -1829,7 +1829,7 @@ ; GENERIC-LABEL: 
test_masked_z_4xi64_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [8:1.00] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask1: @@ -1848,7 +1848,7 @@ ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [8:1.00] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi64_perm_mem_mask2: @@ -1867,7 +1867,7 @@ ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [8:1.00] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask2: @@ -1885,7 +1885,7 @@ define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) { ; GENERIC-LABEL: test_4xi64_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_perm_mem_mask3: @@ -1900,7 +1900,7 @@ ; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [8:1.00] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi64_perm_mem_mask3: @@ -1919,7 +1919,7 @@ ; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [8:1.00] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask3: @@ -2293,7 +2293,7 @@ ; GENERIC-LABEL: test_8xi64_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [6:0.50] -; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_perm_mem_mask0: @@ -2310,7 +2310,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [6:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_mem_mask0: @@ -2331,7 +2331,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [6:0.50] ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask0: @@ -2351,7 +2351,7 @@ ; 
GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [8:1.00] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask1: @@ -2370,7 +2370,7 @@ ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [8:1.00] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1: @@ -2390,7 +2390,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [6:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_mem_mask2: @@ -2411,7 +2411,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [6:0.50] ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask2: @@ -2430,7 +2430,7 @@ define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) { ; GENERIC-LABEL: test_8xi64_perm_imm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_perm_imm_mem_mask3: @@ -2445,7 +2445,7 @@ ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [8:1.00] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask3: @@ -2464,7 +2464,7 @@ ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [8:1.00] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3: @@ -2484,7 +2484,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [6:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_mem_mask4: @@ -2505,7 +2505,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [6:0.50] ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; 
GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask4: @@ -2525,7 +2525,7 @@ ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [8:1.00] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask5: @@ -2544,7 +2544,7 @@ ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [8:1.00] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5: @@ -2563,7 +2563,7 @@ ; GENERIC-LABEL: test_8xi64_perm_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [6:0.50] -; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_perm_mem_mask6: @@ -2580,7 +2580,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [6:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_mem_mask6: @@ -2601,7 +2601,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [6:0.50] ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask6: @@ -2621,7 +2621,7 @@ ; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [8:1.00] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask7: @@ -2640,7 +2640,7 @@ ; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [8:1.00] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7: @@ -2853,7 +2853,7 @@ ; GENERIC-LABEL: test_8xfloat_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50] -; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_perm_mem_mask0: @@ -2870,7 +2870,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50] ; GENERIC-NEXT: 
vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask0: @@ -2891,7 +2891,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask0: @@ -2912,7 +2912,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask1: @@ -2933,7 +2933,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask1: @@ -2954,7 +2954,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask2: @@ -2975,7 +2975,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask2: @@ -2995,7 +2995,7 @@ ; GENERIC-LABEL: test_8xfloat_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50] -; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_perm_mem_mask3: @@ -3012,7 +3012,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask3: @@ -3033,7 +3033,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask3: @@ 
-3247,7 +3247,7 @@ ; GENERIC-LABEL: test_16xfloat_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [6:0.50] -; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_perm_mem_mask0: @@ -3264,7 +3264,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [6:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask0: @@ -3285,7 +3285,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [6:0.50] ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask0: @@ -3306,7 +3306,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [6:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask1: @@ -3327,7 +3327,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [6:0.50] ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask1: @@ -3348,7 +3348,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [6:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask2: @@ -3369,7 +3369,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [6:0.50] ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask2: @@ -3389,7 +3389,7 @@ ; GENERIC-LABEL: test_16xfloat_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [6:0.50] -; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_perm_mem_mask3: @@ -3406,7 +3406,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [6:0.50] ; 
GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask3: @@ -3427,7 +3427,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [6:0.50] ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask3: @@ -3620,7 +3620,7 @@ define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) { ; GENERIC-LABEL: test_4xdouble_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_perm_mem_mask0: @@ -3635,7 +3635,7 @@ ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask0: @@ -3654,7 +3654,7 @@ ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask0: @@ -3673,7 +3673,7 @@ ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask1: @@ -3692,7 +3692,7 @@ ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask1: @@ -3711,7 +3711,7 @@ ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask2: @@ -3730,7 +3730,7 @@ ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_masked_z_4xdouble_perm_mem_mask2: @@ -3748,7 +3748,7 @@ define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) { ; GENERIC-LABEL: test_4xdouble_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_perm_mem_mask3: @@ -3763,7 +3763,7 @@ ; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask3: @@ -3782,7 +3782,7 @@ ; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask3: @@ -4156,7 +4156,7 @@ ; GENERIC-LABEL: test_8xdouble_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [6:0.50] -; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_perm_mem_mask0: @@ -4173,7 +4173,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [6:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask0: @@ -4194,7 +4194,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [6:0.50] ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask0: @@ -4214,7 +4214,7 @@ ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask1: @@ -4233,7 +4233,7 @@ ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1: @@ -4253,7 +4253,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [6:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; 
GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask2: @@ -4274,7 +4274,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [6:0.50] ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask2: @@ -4293,7 +4293,7 @@ define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) { ; GENERIC-LABEL: test_8xdouble_perm_imm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_perm_imm_mem_mask3: @@ -4308,7 +4308,7 @@ ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask3: @@ -4327,7 +4327,7 @@ ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3: @@ -4347,7 +4347,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [6:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask4: @@ -4368,7 +4368,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [6:0.50] ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask4: @@ -4388,7 +4388,7 @@ ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask5: @@ -4407,7 +4407,7 @@ ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5: @@ -4426,7 
+4426,7 @@ ; GENERIC-LABEL: test_8xdouble_perm_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [6:0.50] -; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_perm_mem_mask6: @@ -4443,7 +4443,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [6:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask6: @@ -4464,7 +4464,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [6:0.50] ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask6: @@ -4484,7 +4484,7 @@ ; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask7: @@ -4503,7 +4503,7 @@ ; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7: @@ -4535,7 +4535,7 @@ ; GENERIC-LABEL: test_masked_16xi8_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [6:0.50] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4555,7 +4555,7 @@ ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mask0: @@ -4572,7 +4572,7 @@ ; GENERIC-LABEL: test_masked_16xi8_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [6:0.50] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4592,7 +4592,7 @@ ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: 
vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mask1: @@ -4609,7 +4609,7 @@ ; GENERIC-LABEL: test_masked_16xi8_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [6:0.50] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4629,7 +4629,7 @@ ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mask2: @@ -4659,7 +4659,7 @@ ; GENERIC-LABEL: test_masked_16xi8_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [6:0.50] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4679,7 +4679,7 @@ ; GENERIC-LABEL: test_masked_z_16xi8_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mask3: @@ -4713,7 +4713,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] ; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi8_perm_mem_mask0: @@ -4734,7 +4734,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] ; GENERIC-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask0: @@ -4755,7 +4755,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] ; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [6:0.50] ; GENERIC-NEXT: retq # 
sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi8_perm_mem_mask1: @@ -4776,7 +4776,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] ; GENERIC-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask1: @@ -4797,7 +4797,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] ; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi8_perm_mem_mask2: @@ -4818,7 +4818,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] ; GENERIC-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask2: @@ -4855,7 +4855,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] ; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi8_perm_mem_mask3: @@ -4876,7 +4876,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] ; GENERIC-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask3: @@ -4895,7 +4895,7 @@ define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) { ; GENERIC-LABEL: test_32xi8_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi8_perm_mask0: @@ -4909,7 +4909,7 @@ ; GENERIC-LABEL: test_masked_32xi8_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [6:0.50] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4929,7 +4929,7 @@ ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask0: 
; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mask0: @@ -4946,7 +4946,7 @@ ; GENERIC-LABEL: test_masked_32xi8_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [6:0.50] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4966,7 +4966,7 @@ ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mask1: @@ -4983,7 +4983,7 @@ ; GENERIC-LABEL: test_masked_32xi8_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [6:0.50] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5003,7 +5003,7 @@ ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mask2: @@ -5019,7 +5019,7 @@ define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) { ; GENERIC-LABEL: test_32xi8_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi8_perm_mask3: @@ -5033,7 +5033,7 @@ ; GENERIC-LABEL: test_masked_32xi8_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50] +; 
GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [6:0.50] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5053,7 +5053,7 @@ ; GENERIC-LABEL: test_masked_z_32xi8_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mask3: @@ -5070,7 +5070,7 @@ ; GENERIC-LABEL: test_32xi8_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm0 # sched: [7:0.50] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi8_perm_mem_mask0: @@ -5087,7 +5087,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] ; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi8_perm_mem_mask0: @@ -5108,7 +5108,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] ; GENERIC-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask0: @@ -5129,7 +5129,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] ; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi8_perm_mem_mask1: @@ -5150,7 +5150,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] ; GENERIC-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask1: @@ -5171,7 
+5171,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] ; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi8_perm_mem_mask2: @@ -5192,7 +5192,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] ; GENERIC-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask2: @@ -5212,7 +5212,7 @@ ; GENERIC-LABEL: test_32xi8_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm0 # sched: [7:0.50] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi8_perm_mem_mask3: @@ -5229,7 +5229,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] ; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi8_perm_mem_mask3: @@ -5250,7 +5250,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] ; GENERIC-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask3: @@ -5269,7 +5269,7 @@ define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) { ; GENERIC-LABEL: test_64xi8_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_64xi8_perm_mask0: @@ -5283,7 +5283,7 @@ ; GENERIC-LABEL: test_masked_64xi8_perm_mask0: ; GENERIC: # %bb.0: ; 
GENERIC-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [6:0.50] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5303,7 +5303,7 @@ ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mask0: @@ -5320,7 +5320,7 @@ ; GENERIC-LABEL: test_masked_64xi8_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [6:0.50] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5340,7 +5340,7 @@ ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mask1: @@ -5357,7 +5357,7 @@ ; GENERIC-LABEL: test_masked_64xi8_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: 
[6:0.50] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5377,7 +5377,7 @@ ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mask2: @@ -5393,7 +5393,7 @@ define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) { ; GENERIC-LABEL: test_64xi8_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_64xi8_perm_mask3: @@ -5407,7 +5407,7 @@ ; GENERIC-LABEL: test_masked_64xi8_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [6:0.50] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5427,7 +5427,7 @@ ; GENERIC-LABEL: test_masked_z_64xi8_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mask3: @@ -5444,7 +5444,7 @@ ; GENERIC-LABEL: test_64xi8_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [6:0.50] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = 
zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_64xi8_perm_mem_mask0: @@ -5461,7 +5461,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [6:0.50] ; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mem_mask0: @@ -5482,7 +5482,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [6:0.50] ; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask0: @@ -5503,7 +5503,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [6:0.50] ; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mem_mask1: @@ -5524,7 +5524,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [6:0.50] ; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask1: @@ -5545,7 +5545,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [6:0.50] ; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = 
zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mem_mask2: @@ -5566,7 +5566,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [6:0.50] ; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask2: @@ -5586,7 +5586,7 @@ ; GENERIC-LABEL: test_64xi8_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [6:0.50] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_64xi8_perm_mem_mask3: @@ -5603,7 +5603,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [6:0.50] ; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mem_mask3: @@ -5624,7 +5624,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [6:0.50] ; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_masked_z_64xi8_perm_mem_mask3: @@ -6659,7 +6659,7 @@ define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) { ; GENERIC-LABEL: test_16xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi16_perm_high_mem_mask0: @@ -6674,7 +6674,7 @@ ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask0: @@ -6693,7 +6693,7 @@ ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask0: @@ -6712,7 +6712,7 @@ ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask1: @@ -6731,7 +6731,7 @@ ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask1: @@ -6750,7 +6750,7 @@ ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask2: @@ -6769,7 +6769,7 @@ ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask2: @@ -6787,7 +6787,7 @@ define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) { ; GENERIC-LABEL: test_16xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 = 
mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi16_perm_low_mem_mask3: @@ -6802,7 +6802,7 @@ ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask3: @@ -6821,7 +6821,7 @@ ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask3: @@ -6840,7 +6840,7 @@ ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask4: @@ -6859,7 +6859,7 @@ ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask4: @@ -6878,7 +6878,7 @@ ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask5: @@ -6897,7 +6897,7 @@ ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask5: @@ -6915,7 +6915,7 @@ define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) { ; GENERIC-LABEL: test_16xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi16_perm_high_mem_mask6: @@ 
-6930,7 +6930,7 @@ ; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask6: @@ -6949,7 +6949,7 @@ ; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask6: @@ -6968,7 +6968,7 @@ ; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask7: @@ -6987,7 +6987,7 @@ ; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask7: @@ -7340,7 +7340,7 @@ define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) { ; GENERIC-LABEL: test_32xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi16_perm_high_mem_mask0: @@ -7355,7 +7355,7 @@ ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask0: @@ -7374,7 +7374,7 @@ ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_masked_z_32xi16_perm_high_mem_mask0: @@ -7393,7 +7393,7 @@ ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask1: @@ -7412,7 +7412,7 @@ ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask1: @@ -7431,7 +7431,7 @@ ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask2: @@ -7450,7 +7450,7 @@ ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask2: @@ -7468,7 +7468,7 @@ define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) { ; GENERIC-LABEL: test_32xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi16_perm_low_mem_mask3: @@ -7483,7 +7483,7 @@ ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask3: @@ -7502,7 +7502,7 @@ ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: 
vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask3: @@ -7521,7 +7521,7 @@ ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask4: @@ -7540,7 +7540,7 @@ ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask4: @@ -7558,7 +7558,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [6:1.00] ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7579,7 +7579,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask5: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [6:1.00] ; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7600,7 +7600,7 @@ define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) { ; GENERIC-LABEL: test_32xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi16_perm_high_mem_mask6: @@ -7615,7 +7615,7 @@ ; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = 
mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask6: @@ -7634,7 +7634,7 @@ ; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask6: @@ -7653,7 +7653,7 @@ ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask7: @@ -7672,7 +7672,7 @@ ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask7: @@ -8233,7 +8233,7 @@ ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask0: @@ -8252,7 +8252,7 @@ ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask0: @@ -8271,7 +8271,7 @@ ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask1: @@ -8290,7 +8290,7 @@ ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] 
sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask1: @@ -8309,7 +8309,7 @@ ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask2: @@ -8328,7 +8328,7 @@ ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask2: @@ -8361,7 +8361,7 @@ ; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask3: @@ -8380,7 +8380,7 @@ ; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask3: @@ -8587,7 +8587,7 @@ ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask0: @@ -8606,7 +8606,7 @@ ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask0: @@ -8625,7 +8625,7 @@ ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask1: @@ -8644,7 +8644,7 @@ ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00] +; GENERIC-NEXT: vpshufd 
{{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask1: @@ -8663,7 +8663,7 @@ ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask2: @@ -8682,7 +8682,7 @@ ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask2: @@ -8715,7 +8715,7 @@ ; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask3: @@ -8734,7 +8734,7 @@ ; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask3: @@ -8941,7 +8941,7 @@ ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [6:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8962,7 +8962,7 @@ ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0: @@ -8981,7 +8981,7 @@ ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [6:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -9002,7 +9002,7 @@ ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: 
vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1: @@ -9021,7 +9021,7 @@ ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -9042,7 +9042,7 @@ ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2: @@ -9075,7 +9075,7 @@ ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -9096,7 +9096,7 @@ ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3: @@ -9288,7 +9288,7 @@ define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) { ; GENERIC-LABEL: test_16xfloat_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_shuff_mem_mask0: @@ -9303,7 +9303,7 @@ ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [6:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -9324,7 +9324,7 @@ ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0: @@ -9343,7 
+9343,7 @@ ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [6:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -9364,7 +9364,7 @@ ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1: @@ -9383,7 +9383,7 @@ ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [6:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -9404,7 +9404,7 @@ ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2: @@ -9422,7 +9422,7 @@ define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) { ; GENERIC-LABEL: test_16xfloat_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_shuff_mem_mask3: @@ -9437,7 +9437,7 @@ ; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -9458,7 +9458,7 @@ ; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3: @@ -9665,7 +9665,7 @@ ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm2, 
%ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [6:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -9686,7 +9686,7 @@ ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0: @@ -9705,7 +9705,7 @@ ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [6:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -9726,7 +9726,7 @@ ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1: @@ -9745,7 +9745,7 @@ ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [6:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -9766,7 +9766,7 @@ ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2: @@ -9799,7 +9799,7 @@ ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [6:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -9820,7 +9820,7 @@ ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3: @@ -10012,7 +10012,7 @@ define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) { ; GENERIC-LABEL: test_8xdouble_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 
= zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_shuff_mem_mask0: @@ -10027,7 +10027,7 @@ ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [6:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -10048,7 +10048,7 @@ ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0: @@ -10067,7 +10067,7 @@ ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [6:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -10088,7 +10088,7 @@ ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1: @@ -10107,7 +10107,7 @@ ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [6:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -10128,7 +10128,7 @@ ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2: @@ -10146,7 +10146,7 @@ define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) { ; GENERIC-LABEL: test_8xdouble_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_shuff_mem_mask3: @@ -10161,7 +10161,7 @@ ; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff64x2 
{{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [6:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -10182,7 +10182,7 @@ ; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3: @@ -10374,7 +10374,7 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) { ; GENERIC-LABEL: test_8xi32_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_shuff_mem_mask0: @@ -10389,7 +10389,7 @@ ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [6:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -10410,7 +10410,7 @@ ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0: @@ -10429,7 +10429,7 @@ ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -10450,7 +10450,7 @@ ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1: @@ -10469,7 +10469,7 @@ ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -10490,7 +10490,7 @@ ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 
{%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2: @@ -10508,7 +10508,7 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) { ; GENERIC-LABEL: test_8xi32_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_shuff_mem_mask3: @@ -10523,7 +10523,7 @@ ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -10544,7 +10544,7 @@ ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3: @@ -10736,7 +10736,7 @@ define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) { ; GENERIC-LABEL: test_16xi32_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_shuff_mem_mask0: @@ -10751,7 +10751,7 @@ ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [6:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -10772,7 +10772,7 @@ ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask0: @@ -10791,7 +10791,7 @@ ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [6:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -10812,7 +10812,7 @@ ; GENERIC-LABEL: 
test_16xi32_zero_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask1: @@ -10831,7 +10831,7 @@ ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -10852,7 +10852,7 @@ ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask2: @@ -10870,7 +10870,7 @@ define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p) { ; GENERIC-LABEL: test_16xi32_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_shuff_mem_mask3: @@ -10885,7 +10885,7 @@ ; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -10906,7 +10906,7 @@ ; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00] +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask3: @@ -11098,7 +11098,7 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) { ; GENERIC-LABEL: test_4xi64_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_shuff_mem_mask0: @@ -11113,7 +11113,7 @@ ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00] 
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [6:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -11134,7 +11134,7 @@ ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0: @@ -11153,7 +11153,7 @@ ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [6:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -11174,7 +11174,7 @@ ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1: @@ -11193,7 +11193,7 @@ ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [6:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -11214,7 +11214,7 @@ ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2: @@ -11232,7 +11232,7 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) { ; GENERIC-LABEL: test_4xi64_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_shuff_mem_mask3: @@ -11247,7 +11247,7 @@ ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [6:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -11268,7 +11268,7 @@ ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; 
SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3: @@ -11460,7 +11460,7 @@ define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) { ; GENERIC-LABEL: test_8xi64_shuff_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_shuff_mem_mask0: @@ -11475,7 +11475,7 @@ ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [6:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -11496,7 +11496,7 @@ ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask0: @@ -11515,7 +11515,7 @@ ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [6:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -11536,7 +11536,7 @@ ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask1: @@ -11555,7 +11555,7 @@ ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [6:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -11576,7 +11576,7 @@ ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask2: @@ -11594,7 +11594,7 @@ define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) { ; GENERIC-LABEL: test_8xi64_shuff_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; 
SKX-LABEL: test_8xi64_shuff_mem_mask3: @@ -11609,7 +11609,7 @@ ; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [6:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -11630,7 +11630,7 @@ ; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask3: @@ -11837,7 +11837,7 @@ ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -11858,7 +11858,7 @@ ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0: @@ -11877,7 +11877,7 @@ ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -11898,7 +11898,7 @@ ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1: @@ -11917,7 +11917,7 @@ ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -11938,7 +11938,7 @@ ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = 
xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2: @@ -11971,7 +11971,7 @@ ; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -11992,7 +11992,7 @@ ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3: @@ -12836,7 +12836,7 @@ ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00] +; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [6:1.00] ; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -12857,7 +12857,7 @@ ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00] +; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0: @@ -12876,7 +12876,7 @@ ; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00] +; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [6:1.00] ; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -12897,7 +12897,7 @@ ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00] +; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1: @@ -13828,7 +13828,7 @@ ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -13849,7 +13849,7 @@ ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; GENERIC-NEXT: vunpckhps {{.*#+}} 
xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0: @@ -13868,7 +13868,7 @@ ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -13889,7 +13889,7 @@ ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1: @@ -13908,7 +13908,7 @@ ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -13929,7 +13929,7 @@ ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2: @@ -13962,7 +13962,7 @@ ; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -13983,7 +13983,7 @@ ; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3: @@ -14827,7 +14827,7 @@ ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00] +; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [6:1.00] ; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14848,7 +14848,7 @@ ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: 
vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00] +; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0: @@ -14867,7 +14867,7 @@ ; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00] +; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [6:1.00] ; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -14888,7 +14888,7 @@ ; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00] +; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [6:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1: Index: test/CodeGen/X86/avx512vl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512vl-intrinsics.ll +++ test/CodeGen/X86/avx512vl-intrinsics.ll @@ -2864,8 +2864,8 @@ define <4 x i32>@test_int_x86_avx512_mask_prorv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x14,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x14,0xd9] ; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x14,0xd1] ; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x14,0xc1] ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] @@ -2884,8 +2884,8 @@ define <8 x i32>@test_int_x86_avx512_mask_prorv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x14,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x14,0xd9] ; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x14,0xd1] ; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x14,0xc1] ; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] @@ -2904,8 +2904,8 @@ define <2 x i64>@test_int_x86_avx512_mask_prorv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x14,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x14,0xd9] ; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x14,0xd1] ; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x14,0xc1] ; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] @@ -2924,8 +2924,8 @@ define 
<4 x i64>@test_int_x86_avx512_mask_prorv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x14,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x14,0xd9] ; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x14,0xd1] ; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x14,0xc1] ; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] @@ -3024,8 +3024,8 @@ define <4 x i32>@test_int_x86_avx512_mask_prolv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x15,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x15,0xd9] ; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x15,0xd1] ; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x15,0xc1] ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] @@ -3044,8 +3044,8 @@ define <8 x i32>@test_int_x86_avx512_mask_prolv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x15,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x15,0xd9] ; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x15,0xd1] ; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x15,0xc1] ; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] @@ -3064,8 +3064,8 @@ define <2 x i64>@test_int_x86_avx512_mask_prolv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x15,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x15,0xd9] ; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x15,0xd1] ; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x15,0xc1] ; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] @@ -3084,8 +3084,8 @@ define <4 x i64>@test_int_x86_avx512_mask_prolv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x15,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x15,0xd9] ; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x15,0xd1] ; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x15,0xc1] ; CHECK-NEXT: 
vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] Index: test/CodeGen/X86/avx512vpopcntdq-schedule.ll =================================================================== --- test/CodeGen/X86/avx512vpopcntdq-schedule.ll +++ test/CodeGen/X86/avx512vpopcntdq-schedule.ll @@ -8,15 +8,15 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovw %esi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: #APP -; GENERIC-NEXT: vpopcntd %zmm1, %zmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpopcntd %zmm1, %zmm0 {%k1} # sched: [1:0.50] -; GENERIC-NEXT: vpopcntd %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50] -; GENERIC-NEXT: vpopcntd (%rdi), %zmm0 # sched: [8:0.50] -; GENERIC-NEXT: vpopcntd (%rdi), %zmm0 {%k1} # sched: [8:0.50] -; GENERIC-NEXT: vpopcntd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] -; GENERIC-NEXT: vpopcntd (%rdi){1to16}, %zmm0 # sched: [8:0.50] -; GENERIC-NEXT: vpopcntd (%rdi){1to16}, %zmm0 {%k1} # sched: [8:0.50] -; GENERIC-NEXT: vpopcntd (%rdi){1to16}, %zmm0 {%k1} {z} # sched: [8:0.50] +; GENERIC-NEXT: vpopcntd %zmm1, %zmm0 # sched: [3:1.00] +; GENERIC-NEXT: vpopcntd %zmm1, %zmm0 {%k1} # sched: [3:1.00] +; GENERIC-NEXT: vpopcntd %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00] +; GENERIC-NEXT: vpopcntd (%rdi), %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpopcntd (%rdi), %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpopcntd (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpopcntd (%rdi){1to16}, %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpopcntd (%rdi){1to16}, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpopcntd (%rdi){1to16}, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -28,12 +28,12 @@ ; ICELAKE-NEXT: vpopcntd %zmm1, %zmm0 # sched: [1:0.50] ; ICELAKE-NEXT: vpopcntd %zmm1, %zmm0 {%k1} # sched: [1:0.50] ; ICELAKE-NEXT: vpopcntd %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50] -; ICELAKE-NEXT: vpopcntd (%rdi), %zmm0 # sched: [8:0.50] -; ICELAKE-NEXT: vpopcntd (%rdi), %zmm0 {%k1} # sched: [8:0.50] -; ICELAKE-NEXT: vpopcntd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] -; ICELAKE-NEXT: vpopcntd (%rdi){1to16}, %zmm0 # sched: [8:0.50] -; ICELAKE-NEXT: vpopcntd (%rdi){1to16}, %zmm0 {%k1} # sched: [8:0.50] -; ICELAKE-NEXT: vpopcntd (%rdi){1to16}, %zmm0 {%k1} {z} # sched: [8:0.50] +; ICELAKE-NEXT: vpopcntd (%rdi), %zmm0 # sched: [6:0.50] +; ICELAKE-NEXT: vpopcntd (%rdi), %zmm0 {%k1} # sched: [6:0.50] +; ICELAKE-NEXT: vpopcntd (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50] +; ICELAKE-NEXT: vpopcntd (%rdi){1to16}, %zmm0 # sched: [6:0.50] +; ICELAKE-NEXT: vpopcntd (%rdi){1to16}, %zmm0 {%k1} # sched: [6:0.50] +; ICELAKE-NEXT: vpopcntd (%rdi){1to16}, %zmm0 {%k1} {z} # sched: [6:0.50] ; ICELAKE-NEXT: #NO_APP ; ICELAKE-NEXT: vzeroupper # sched: [4:1.00] ; ICELAKE-NEXT: retq # sched: [7:1.00] @@ -46,15 +46,15 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovw %esi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: #APP -; GENERIC-NEXT: vpopcntq %zmm1, %zmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpopcntq %zmm1, %zmm0 {%k1} # sched: [1:0.50] -; GENERIC-NEXT: vpopcntq %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50] -; GENERIC-NEXT: vpopcntq (%rdi), %zmm0 # sched: [8:0.50] -; GENERIC-NEXT: vpopcntq (%rdi), %zmm0 {%k1} # sched: [8:0.50] -; GENERIC-NEXT: vpopcntq (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] -; GENERIC-NEXT: vpopcntq (%rdi){1to8}, %zmm0 # sched: [8:0.50] -; GENERIC-NEXT: vpopcntq (%rdi){1to8}, %zmm0 {%k1} # sched: [8:0.50] -; GENERIC-NEXT: vpopcntq (%rdi){1to8}, %zmm0 {%k1} {z} # sched: [8:0.50] +; GENERIC-NEXT: vpopcntq %zmm1, %zmm0 # 
sched: [3:1.00] +; GENERIC-NEXT: vpopcntq %zmm1, %zmm0 {%k1} # sched: [3:1.00] +; GENERIC-NEXT: vpopcntq %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00] +; GENERIC-NEXT: vpopcntq (%rdi), %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpopcntq (%rdi), %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpopcntq (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00] +; GENERIC-NEXT: vpopcntq (%rdi){1to8}, %zmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpopcntq (%rdi){1to8}, %zmm0 {%k1} # sched: [8:1.00] +; GENERIC-NEXT: vpopcntq (%rdi){1to8}, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -66,12 +66,12 @@ ; ICELAKE-NEXT: vpopcntq %zmm1, %zmm0 # sched: [1:0.50] ; ICELAKE-NEXT: vpopcntq %zmm1, %zmm0 {%k1} # sched: [1:0.50] ; ICELAKE-NEXT: vpopcntq %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50] -; ICELAKE-NEXT: vpopcntq (%rdi), %zmm0 # sched: [8:0.50] -; ICELAKE-NEXT: vpopcntq (%rdi), %zmm0 {%k1} # sched: [8:0.50] -; ICELAKE-NEXT: vpopcntq (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] -; ICELAKE-NEXT: vpopcntq (%rdi){1to8}, %zmm0 # sched: [8:0.50] -; ICELAKE-NEXT: vpopcntq (%rdi){1to8}, %zmm0 {%k1} # sched: [8:0.50] -; ICELAKE-NEXT: vpopcntq (%rdi){1to8}, %zmm0 {%k1} {z} # sched: [8:0.50] +; ICELAKE-NEXT: vpopcntq (%rdi), %zmm0 # sched: [6:0.50] +; ICELAKE-NEXT: vpopcntq (%rdi), %zmm0 {%k1} # sched: [6:0.50] +; ICELAKE-NEXT: vpopcntq (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50] +; ICELAKE-NEXT: vpopcntq (%rdi){1to8}, %zmm0 # sched: [6:0.50] +; ICELAKE-NEXT: vpopcntq (%rdi){1to8}, %zmm0 {%k1} # sched: [6:0.50] +; ICELAKE-NEXT: vpopcntq (%rdi){1to8}, %zmm0 {%k1} {z} # sched: [6:0.50] ; ICELAKE-NEXT: #NO_APP ; ICELAKE-NEXT: vzeroupper # sched: [4:1.00] ; ICELAKE-NEXT: retq # sched: [7:1.00] Index: test/CodeGen/X86/mmx-schedule.ll =================================================================== --- test/CodeGen/X86/mmx-schedule.ll +++ test/CodeGen/X86/mmx-schedule.ll @@ -3409,7 +3409,7 @@ ; ZNVER1-LABEL: test_phsubsw: ; ZNVER1: # %bb.0: ; ZNVER1-NEXT: phsubsw %mm1, %mm0 # sched: [100:?] -; ZNVER1-NEXT: phsubsw (%rdi), %mm0 # sched: [100:?] 
+; ZNVER1-NEXT: phsubsw (%rdi), %mm0 # sched: [8:0.50] ; ZNVER1-NEXT: movq %mm0, %rax # sched: [2:1.00] ; ZNVER1-NEXT: retq # sched: [1:0.50] %1 = call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %a0, x86_mmx %a1) @@ -3591,7 +3591,7 @@ ; GENERIC-LABEL: test_pmaddwd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: pmaddwd %mm1, %mm0 # sched: [5:1.00] -; GENERIC-NEXT: pmaddwd (%rdi), %mm0 # sched: [10:1.00] +; GENERIC-NEXT: pmaddwd (%rdi), %mm0 # sched: [11:1.00] ; GENERIC-NEXT: movq %mm0, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3612,7 +3612,7 @@ ; SANDY-LABEL: test_pmaddwd: ; SANDY: # %bb.0: ; SANDY-NEXT: pmaddwd %mm1, %mm0 # sched: [5:1.00] -; SANDY-NEXT: pmaddwd (%rdi), %mm0 # sched: [10:1.00] +; SANDY-NEXT: pmaddwd (%rdi), %mm0 # sched: [11:1.00] ; SANDY-NEXT: movq %mm0, %rax # sched: [1:0.33] ; SANDY-NEXT: retq # sched: [1:1.00] ; @@ -3669,7 +3669,7 @@ ; GENERIC-LABEL: test_pmaddubsw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: pmaddubsw %mm1, %mm0 # sched: [5:1.00] -; GENERIC-NEXT: pmaddubsw (%rdi), %mm0 # sched: [10:1.00] +; GENERIC-NEXT: pmaddubsw (%rdi), %mm0 # sched: [11:1.00] ; GENERIC-NEXT: movq %mm0, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3690,7 +3690,7 @@ ; SANDY-LABEL: test_pmaddubsw: ; SANDY: # %bb.0: ; SANDY-NEXT: pmaddubsw %mm1, %mm0 # sched: [5:1.00] -; SANDY-NEXT: pmaddubsw (%rdi), %mm0 # sched: [10:1.00] +; SANDY-NEXT: pmaddubsw (%rdi), %mm0 # sched: [11:1.00] ; SANDY-NEXT: movq %mm0, %rax # sched: [1:0.33] ; SANDY-NEXT: retq # sched: [1:1.00] ; @@ -4114,7 +4114,7 @@ ; GENERIC-LABEL: test_pmulhrsw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: pmulhrsw %mm1, %mm0 # sched: [5:1.00] -; GENERIC-NEXT: pmulhrsw (%rdi), %mm0 # sched: [10:1.00] +; GENERIC-NEXT: pmulhrsw (%rdi), %mm0 # sched: [11:1.00] ; GENERIC-NEXT: movq %mm0, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4135,7 +4135,7 @@ ; SANDY-LABEL: test_pmulhrsw: ; SANDY: # %bb.0: ; SANDY-NEXT: pmulhrsw %mm1, %mm0 # sched: [5:1.00] -; SANDY-NEXT: pmulhrsw (%rdi), %mm0 # sched: [10:1.00] +; SANDY-NEXT: pmulhrsw (%rdi), %mm0 # sched: [11:1.00] ; SANDY-NEXT: movq %mm0, %rax # sched: [1:0.33] ; SANDY-NEXT: retq # sched: [1:1.00] ; @@ -4192,7 +4192,7 @@ ; GENERIC-LABEL: test_pmulhw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: pmulhw %mm1, %mm0 # sched: [5:1.00] -; GENERIC-NEXT: pmulhw (%rdi), %mm0 # sched: [10:1.00] +; GENERIC-NEXT: pmulhw (%rdi), %mm0 # sched: [11:1.00] ; GENERIC-NEXT: movq %mm0, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4213,7 +4213,7 @@ ; SANDY-LABEL: test_pmulhw: ; SANDY: # %bb.0: ; SANDY-NEXT: pmulhw %mm1, %mm0 # sched: [5:1.00] -; SANDY-NEXT: pmulhw (%rdi), %mm0 # sched: [10:1.00] +; SANDY-NEXT: pmulhw (%rdi), %mm0 # sched: [11:1.00] ; SANDY-NEXT: movq %mm0, %rax # sched: [1:0.33] ; SANDY-NEXT: retq # sched: [1:1.00] ; @@ -4270,7 +4270,7 @@ ; GENERIC-LABEL: test_pmulhuw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: pmulhuw %mm1, %mm0 # sched: [5:1.00] -; GENERIC-NEXT: pmulhuw (%rdi), %mm0 # sched: [10:1.00] +; GENERIC-NEXT: pmulhuw (%rdi), %mm0 # sched: [11:1.00] ; GENERIC-NEXT: movq %mm0, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4291,7 +4291,7 @@ ; SANDY-LABEL: test_pmulhuw: ; SANDY: # %bb.0: ; SANDY-NEXT: pmulhuw %mm1, %mm0 # sched: [5:1.00] -; SANDY-NEXT: pmulhuw (%rdi), %mm0 # sched: [10:1.00] +; SANDY-NEXT: pmulhuw (%rdi), %mm0 # sched: [11:1.00] ; SANDY-NEXT: movq %mm0, %rax # sched: [1:0.33] ; SANDY-NEXT: retq # sched: [1:1.00] ; @@ -4348,7 +4348,7 @@ ; GENERIC-LABEL: test_pmullw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: 
pmullw %mm1, %mm0 # sched: [5:1.00] -; GENERIC-NEXT: pmullw (%rdi), %mm0 # sched: [10:1.00] +; GENERIC-NEXT: pmullw (%rdi), %mm0 # sched: [11:1.00] ; GENERIC-NEXT: movq %mm0, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4369,7 +4369,7 @@ ; SANDY-LABEL: test_pmullw: ; SANDY: # %bb.0: ; SANDY-NEXT: pmullw %mm1, %mm0 # sched: [5:1.00] -; SANDY-NEXT: pmullw (%rdi), %mm0 # sched: [10:1.00] +; SANDY-NEXT: pmullw (%rdi), %mm0 # sched: [11:1.00] ; SANDY-NEXT: movq %mm0, %rax # sched: [1:0.33] ; SANDY-NEXT: retq # sched: [1:1.00] ; @@ -4426,7 +4426,7 @@ ; GENERIC-LABEL: test_pmuludq: ; GENERIC: # %bb.0: ; GENERIC-NEXT: pmuludq %mm1, %mm0 # sched: [5:1.00] -; GENERIC-NEXT: pmuludq (%rdi), %mm0 # sched: [10:1.00] +; GENERIC-NEXT: pmuludq (%rdi), %mm0 # sched: [11:1.00] ; GENERIC-NEXT: movq %mm0, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4447,7 +4447,7 @@ ; SANDY-LABEL: test_pmuludq: ; SANDY: # %bb.0: ; SANDY-NEXT: pmuludq %mm1, %mm0 # sched: [5:1.00] -; SANDY-NEXT: pmuludq (%rdi), %mm0 # sched: [10:1.00] +; SANDY-NEXT: pmuludq (%rdi), %mm0 # sched: [11:1.00] ; SANDY-NEXT: movq %mm0, %rax # sched: [1:0.33] ; SANDY-NEXT: retq # sched: [1:1.00] ; @@ -4582,7 +4582,7 @@ ; GENERIC-LABEL: test_psadbw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: psadbw %mm1, %mm0 # sched: [5:1.00] -; GENERIC-NEXT: psadbw (%rdi), %mm0 # sched: [10:1.00] +; GENERIC-NEXT: psadbw (%rdi), %mm0 # sched: [11:1.00] ; GENERIC-NEXT: movq %mm0, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4603,7 +4603,7 @@ ; SANDY-LABEL: test_psadbw: ; SANDY: # %bb.0: ; SANDY-NEXT: psadbw %mm1, %mm0 # sched: [5:1.00] -; SANDY-NEXT: psadbw (%rdi), %mm0 # sched: [10:1.00] +; SANDY-NEXT: psadbw (%rdi), %mm0 # sched: [11:1.00] ; SANDY-NEXT: movq %mm0, %rax # sched: [1:0.33] ; SANDY-NEXT: retq # sched: [1:1.00] ; Index: test/CodeGen/X86/pr30821.mir =================================================================== --- test/CodeGen/X86/pr30821.mir +++ test/CodeGen/X86/pr30821.mir @@ -64,9 +64,9 @@ ; data into $xmm[0-14] and volatile storing them later, leaving regalloc only ; $xmm15 to play with in the middle. ; Then, perform two virtreg load-and-store pairs, with the faulty code - ; sequence in the middle (MOVSDrm then MOVAPDmr on the same slot). The - ; virtreg gets spilt; the corresponding stack slots merged; and faulty code - ; sequence eliminated if LLVM is broken. + ; sequence in the middle (MOVSDrm then MOVAPDmr on the same slot). The virtreg + ; gets spilt; the corresponding stack slots merged; and faulty code sequence + ; eliminated if LLVM is broken. 
; Make first 15 $xmm registers live $xmm0 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india) Index: test/CodeGen/X86/sha-schedule.ll =================================================================== --- test/CodeGen/X86/sha-schedule.ll +++ test/CodeGen/X86/sha-schedule.ll @@ -12,7 +12,7 @@ ; GENERIC-LABEL: test_sha1msg1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: sha1msg1 %xmm1, %xmm0 # sched: [5:1.00] -; GENERIC-NEXT: sha1msg1 (%rdi), %xmm0 # sched: [11:1.00] +; GENERIC-NEXT: sha1msg1 (%rdi), %xmm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; GOLDMONT-LABEL: test_sha1msg1: @@ -23,8 +23,8 @@ ; ; CANNONLAKE-LABEL: test_sha1msg1: ; CANNONLAKE: # %bb.0: -; CANNONLAKE-NEXT: sha1msg1 %xmm1, %xmm0 # sched: [4:0.33] -; CANNONLAKE-NEXT: sha1msg1 (%rdi), %xmm0 # sched: [10:0.50] +; CANNONLAKE-NEXT: sha1msg1 %xmm1, %xmm0 # sched: [5:1.00] +; CANNONLAKE-NEXT: sha1msg1 (%rdi), %xmm0 # sched: [10:1.00] ; CANNONLAKE-NEXT: retq # sched: [7:1.00] ; ; ZNVER1-LABEL: test_sha1msg1: @@ -43,7 +43,7 @@ ; GENERIC-LABEL: test_sha1msg2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: sha1msg2 %xmm1, %xmm0 # sched: [5:1.00] -; GENERIC-NEXT: sha1msg2 (%rdi), %xmm0 # sched: [11:1.00] +; GENERIC-NEXT: sha1msg2 (%rdi), %xmm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; GOLDMONT-LABEL: test_sha1msg2: @@ -54,8 +54,8 @@ ; ; CANNONLAKE-LABEL: test_sha1msg2: ; CANNONLAKE: # %bb.0: -; CANNONLAKE-NEXT: sha1msg2 %xmm1, %xmm0 # sched: [4:0.33] -; CANNONLAKE-NEXT: sha1msg2 (%rdi), %xmm0 # sched: [10:0.50] +; CANNONLAKE-NEXT: sha1msg2 %xmm1, %xmm0 # sched: [5:1.00] +; CANNONLAKE-NEXT: sha1msg2 (%rdi), %xmm0 # sched: [10:1.00] ; CANNONLAKE-NEXT: retq # sched: [7:1.00] ; ; ZNVER1-LABEL: test_sha1msg2: @@ -74,7 +74,7 @@ ; GENERIC-LABEL: test_sha1nexte: ; GENERIC: # %bb.0: ; GENERIC-NEXT: sha1nexte %xmm1, %xmm0 # sched: [5:1.00] -; GENERIC-NEXT: sha1nexte (%rdi), %xmm0 # sched: [11:1.00] +; GENERIC-NEXT: sha1nexte (%rdi), %xmm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; GOLDMONT-LABEL: test_sha1nexte: @@ -85,8 +85,8 @@ ; ; CANNONLAKE-LABEL: test_sha1nexte: ; CANNONLAKE: # %bb.0: -; CANNONLAKE-NEXT: sha1nexte %xmm1, %xmm0 # sched: [4:0.33] -; CANNONLAKE-NEXT: sha1nexte (%rdi), %xmm0 # sched: [10:0.50] +; CANNONLAKE-NEXT: sha1nexte %xmm1, %xmm0 # sched: [5:1.00] +; CANNONLAKE-NEXT: sha1nexte (%rdi), %xmm0 # sched: [10:1.00] ; CANNONLAKE-NEXT: retq # sched: [7:1.00] ; ; ZNVER1-LABEL: test_sha1nexte: @@ -105,7 +105,7 @@ ; GENERIC-LABEL: test_sha1rnds4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: sha1rnds4 $3, %xmm1, %xmm0 # sched: [5:1.00] -; GENERIC-NEXT: sha1rnds4 $3, (%rdi), %xmm0 # sched: [11:1.00] +; GENERIC-NEXT: sha1rnds4 $3, (%rdi), %xmm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; GOLDMONT-LABEL: test_sha1rnds4: @@ -116,8 +116,8 @@ ; ; CANNONLAKE-LABEL: test_sha1rnds4: ; CANNONLAKE: # %bb.0: -; CANNONLAKE-NEXT: sha1rnds4 $3, %xmm1, %xmm0 # sched: [4:0.33] -; CANNONLAKE-NEXT: sha1rnds4 $3, (%rdi), %xmm0 # sched: [10:0.50] +; CANNONLAKE-NEXT: sha1rnds4 $3, %xmm1, %xmm0 # sched: [5:1.00] +; CANNONLAKE-NEXT: sha1rnds4 $3, (%rdi), %xmm0 # sched: [10:1.00] ; CANNONLAKE-NEXT: retq # sched: [7:1.00] ; ; ZNVER1-LABEL: test_sha1rnds4: @@ -140,7 +140,7 @@ ; GENERIC-LABEL: test_sha256msg1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: sha256msg1 %xmm1, %xmm0 # sched: [5:1.00] -; GENERIC-NEXT: sha256msg1 (%rdi), %xmm0 # sched: [11:1.00] +; GENERIC-NEXT: sha256msg1 (%rdi), %xmm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; GOLDMONT-LABEL: 
test_sha256msg1: @@ -151,8 +151,8 @@ ; ; CANNONLAKE-LABEL: test_sha256msg1: ; CANNONLAKE: # %bb.0: -; CANNONLAKE-NEXT: sha256msg1 %xmm1, %xmm0 # sched: [4:0.33] -; CANNONLAKE-NEXT: sha256msg1 (%rdi), %xmm0 # sched: [10:0.50] +; CANNONLAKE-NEXT: sha256msg1 %xmm1, %xmm0 # sched: [5:1.00] +; CANNONLAKE-NEXT: sha256msg1 (%rdi), %xmm0 # sched: [10:1.00] ; CANNONLAKE-NEXT: retq # sched: [7:1.00] ; ; ZNVER1-LABEL: test_sha256msg1: @@ -171,7 +171,7 @@ ; GENERIC-LABEL: test_sha256msg2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: sha256msg2 %xmm1, %xmm0 # sched: [5:1.00] -; GENERIC-NEXT: sha256msg2 (%rdi), %xmm0 # sched: [11:1.00] +; GENERIC-NEXT: sha256msg2 (%rdi), %xmm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; GOLDMONT-LABEL: test_sha256msg2: @@ -182,8 +182,8 @@ ; ; CANNONLAKE-LABEL: test_sha256msg2: ; CANNONLAKE: # %bb.0: -; CANNONLAKE-NEXT: sha256msg2 %xmm1, %xmm0 # sched: [4:0.33] -; CANNONLAKE-NEXT: sha256msg2 (%rdi), %xmm0 # sched: [10:0.50] +; CANNONLAKE-NEXT: sha256msg2 %xmm1, %xmm0 # sched: [5:1.00] +; CANNONLAKE-NEXT: sha256msg2 (%rdi), %xmm0 # sched: [10:1.00] ; CANNONLAKE-NEXT: retq # sched: [7:1.00] ; ; ZNVER1-LABEL: test_sha256msg2: @@ -204,7 +204,7 @@ ; GENERIC-NEXT: movaps %xmm0, %xmm3 # sched: [1:1.00] ; GENERIC-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: sha256rnds2 %xmm0, %xmm1, %xmm3 # sched: [5:1.00] -; GENERIC-NEXT: sha256rnds2 %xmm0, (%rdi), %xmm3 # sched: [11:1.00] +; GENERIC-NEXT: sha256rnds2 %xmm0, (%rdi), %xmm3 # sched: [10:1.00] ; GENERIC-NEXT: movaps %xmm3, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -221,8 +221,8 @@ ; CANNONLAKE: # %bb.0: ; CANNONLAKE-NEXT: vmovaps %xmm0, %xmm3 # sched: [1:0.33] ; CANNONLAKE-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33] -; CANNONLAKE-NEXT: sha256rnds2 %xmm0, %xmm1, %xmm3 # sched: [4:0.33] -; CANNONLAKE-NEXT: sha256rnds2 %xmm0, (%rdi), %xmm3 # sched: [10:0.50] +; CANNONLAKE-NEXT: sha256rnds2 %xmm0, %xmm1, %xmm3 # sched: [5:1.00] +; CANNONLAKE-NEXT: sha256rnds2 %xmm0, (%rdi), %xmm3 # sched: [10:1.00] ; CANNONLAKE-NEXT: vmovaps %xmm3, %xmm0 # sched: [1:0.33] ; CANNONLAKE-NEXT: retq # sched: [7:1.00] ; Index: test/CodeGen/X86/sse4a-schedule.ll =================================================================== --- test/CodeGen/X86/sse4a-schedule.ll +++ test/CodeGen/X86/sse4a-schedule.ll @@ -6,7 +6,7 @@ define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) { ; GENERIC-LABEL: test_extrq: ; GENERIC: # %bb.0: -; GENERIC-NEXT: extrq %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: extrq %xmm1, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; BTVER2-LABEL: test_extrq: @@ -26,7 +26,7 @@ define <2 x i64> @test_extrqi(<2 x i64> %a0) { ; GENERIC-LABEL: test_extrqi: ; GENERIC: # %bb.0: -; GENERIC-NEXT: extrq $2, $3, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: extrq $2, $3, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; BTVER2-LABEL: test_extrqi: @@ -46,7 +46,7 @@ define <2 x i64> @test_insertq(<2 x i64> %a0, <2 x i64> %a1) { ; GENERIC-LABEL: test_insertq: ; GENERIC: # %bb.0: -; GENERIC-NEXT: insertq %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: insertq %xmm1, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; BTVER2-LABEL: test_insertq: @@ -66,7 +66,7 @@ define <2 x i64> @test_insertqi(<2 x i64> %a0, <2 x i64> %a1) { ; GENERIC-LABEL: test_insertqi: ; GENERIC: # %bb.0: -; GENERIC-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; 
BTVER2-LABEL: test_insertqi: Index: test/CodeGen/X86/xop-schedule.ll =================================================================== --- test/CodeGen/X86/xop-schedule.ll +++ test/CodeGen/X86/xop-schedule.ll @@ -124,8 +124,8 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: #APP ; GENERIC-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpcmov (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:1.00] -; GENERIC-NEXT: vpcmov %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpcmov (%rdi), %ymm1, %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpcmov %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -147,14 +147,14 @@ ; GENERIC-LABEL: test_vpcom: ; GENERIC: # %bb.0: ; GENERIC-NEXT: #APP -; GENERIC-NEXT: vpcomb $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpcomd $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpcomq $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpcomw $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpcomb $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50] -; GENERIC-NEXT: vpcomd $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50] -; GENERIC-NEXT: vpcomq $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50] -; GENERIC-NEXT: vpcomw $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; GENERIC-NEXT: vpcomb $3, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcomd $3, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcomq $3, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcomw $3, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcomb $3, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpcomd $3, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpcomq $3, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpcomw $3, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -179,14 +179,14 @@ ; GENERIC-LABEL: test_vpcomu: ; GENERIC: # %bb.0: ; GENERIC-NEXT: #APP -; GENERIC-NEXT: vpcomub $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpcomud $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpcomuq $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpcomuw $3, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpcomub $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50] -; GENERIC-NEXT: vpcomud $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50] -; GENERIC-NEXT: vpcomuq $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50] -; GENERIC-NEXT: vpcomuw $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; GENERIC-NEXT: vpcomub $3, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcomud $3, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcomuq $3, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcomuw $3, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: vpcomub $3, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpcomud $3, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpcomuq $3, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; GENERIC-NEXT: vpcomuw $3, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -212,8 +212,8 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: #APP ; GENERIC-NEXT: vpermil2pd $3, %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpermil2pd $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vpermil2pd $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpermil2pd $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: 
[6:1.00] +; GENERIC-NEXT: vpermil2pd $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [6:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -234,8 +234,8 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: #APP ; GENERIC-NEXT: vpermil2pd $3, %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; GENERIC-NEXT: vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -258,8 +258,8 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: #APP ; GENERIC-NEXT: vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpermil2ps $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vpermil2ps $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpermil2ps $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpermil2ps $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [6:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -280,8 +280,8 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: #APP ; GENERIC-NEXT: vpermil2ps $3, %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; GENERIC-NEXT: vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:1.00] +; GENERIC-NEXT: vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; GENERIC-NEXT: vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [6:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -724,7 +724,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: #APP ; GENERIC-NEXT: vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmacsswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; GENERIC-NEXT: vpmacsswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -744,7 +744,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: #APP ; GENERIC-NEXT: vpmacssww %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmacssww %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; GENERIC-NEXT: vpmacssww %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -764,7 +764,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: #APP ; GENERIC-NEXT: vpmacswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmacswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; GENERIC-NEXT: vpmacswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -784,7 +784,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: #APP ; GENERIC-NEXT: vpmacsww %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmacsww %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; GENERIC-NEXT: vpmacsww %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -804,7 +804,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: #APP ; GENERIC-NEXT: vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmadcsswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; GENERIC-NEXT: vpmadcsswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -824,7 +824,7 @@ ; GENERIC: # %bb.0: ; 
GENERIC-NEXT: #APP ; GENERIC-NEXT: vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmadcswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; GENERIC-NEXT: vpmadcswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -844,8 +844,8 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: #APP ; GENERIC-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; GENERIC-NEXT: vpperm (%rdi), %xmm1, %xmm0, %xmm0 # sched: [7:0.50] -; GENERIC-NEXT: vpperm %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; GENERIC-NEXT: vpperm (%rdi), %xmm1, %xmm0, %xmm0 # sched: [6:0.50] +; GENERIC-NEXT: vpperm %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:0.50] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -869,22 +869,22 @@ ; GENERIC-NEXT: vprotd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vprotq %xmm1, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vprotw %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vprotb (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vprotd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vprotq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vprotw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vprotb %xmm0, (%rdi), %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vprotd %xmm0, (%rdi), %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vprotq %xmm0, (%rdi), %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vprotw %xmm0, (%rdi), %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vprotb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vprotd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vprotq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vprotw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vprotb %xmm0, (%rdi), %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vprotd %xmm0, (%rdi), %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vprotq %xmm0, (%rdi), %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vprotw %xmm0, (%rdi), %xmm0 # sched: [6:1.00] ; GENERIC-NEXT: vprotb $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vprotd $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vprotq $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vprotw $7, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vprotb $7, (%rdi), %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vprotd $7, (%rdi), %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vprotq $7, (%rdi), %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vprotw $7, (%rdi), %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vprotb $7, (%rdi), %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vprotd $7, (%rdi), %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vprotq $7, (%rdi), %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vprotw $7, (%rdi), %xmm0 # sched: [6:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -925,14 +925,14 @@ ; GENERIC-NEXT: vpshad %xmm1, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpshaq %xmm1, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpshaw %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpshab (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vpshad (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vpshaq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vpshaw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vpshab %xmm0, (%rdi), %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vpshad %xmm0, (%rdi), %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vpshaq %xmm0, (%rdi), %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vpshaw %xmm0, (%rdi), %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpshab (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; 
GENERIC-NEXT: vpshad (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpshaq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpshaw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpshab %xmm0, (%rdi), %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpshad %xmm0, (%rdi), %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpshaq %xmm0, (%rdi), %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpshaw %xmm0, (%rdi), %xmm0 # sched: [6:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -965,14 +965,14 @@ ; GENERIC-NEXT: vpshld %xmm1, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpshlq %xmm1, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpshlw %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpshlb (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vpshld (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vpshlq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vpshlw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vpshlb %xmm0, (%rdi), %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vpshld %xmm0, (%rdi), %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vpshlq %xmm0, (%rdi), %xmm0 # sched: [7:1.00] -; GENERIC-NEXT: vpshlw %xmm0, (%rdi), %xmm0 # sched: [7:1.00] +; GENERIC-NEXT: vpshlb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpshld (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpshlq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpshlw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpshlb %xmm0, (%rdi), %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpshld %xmm0, (%rdi), %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpshlq %xmm0, (%rdi), %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpshlw %xmm0, (%rdi), %xmm0 # sched: [6:1.00] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; Index: test/LTO/Resolution/X86/diagnostic-handler-remarks.ll =================================================================== --- test/LTO/Resolution/X86/diagnostic-handler-remarks.ll +++ test/LTO/Resolution/X86/diagnostic-handler-remarks.ll @@ -1,7 +1,5 @@ -; Test of LTO with opt remarks YAML output. 
- -; First try with Regular LTO ; RUN: llvm-as < %s >%t.bc + ; RUN: rm -f %t.yaml ; RUN: llvm-lto2 run -pass-remarks-output=%t.yaml \ ; RUN: -r %t.bc,tinkywinky,p \ @@ -9,15 +7,6 @@ ; RUN: -r %t.bc,main,px -o %t.o %t.bc ; RUN: cat %t.yaml | FileCheck %s -check-prefix=YAML -; Try again with ThinLTO -; RUN: opt -module-summary %s -o %t.bc -; RUN: rm -f %t.thin.1.yaml -; RUN: llvm-lto2 run -pass-remarks-output=%t \ -; RUN: -r %t.bc,tinkywinky,p \ -; RUN: -r %t.bc,patatino,px \ -; RUN: -r %t.bc,main,px -o %t.o %t.bc -; RUN: cat %t.thin.1.yaml | FileCheck %s -check-prefix=YAML - ; YAML: --- !Passed ; YAML-NEXT: Pass: inline ; YAML-NEXT: Name: Inlined Index: test/MC/AArch64/coff-relocations.s =================================================================== --- test/MC/AArch64/coff-relocations.s +++ test/MC/AArch64/coff-relocations.s @@ -52,15 +52,6 @@ ; IMAGE_REL_ARM64_SECREL_LOW12L ldr x0, [x0, :secrel_lo12:foo] -; IMAGE_REL_ARM64_REL21 -adr x0, foo + 0x12345 - -; IMAGE_REL_ARM64_BRANCH19 -bne target - -; IMAGE_REL_ARM64_BRANCH14 -tbz x0, #0, target - ; CHECK: Format: COFF-ARM64 ; CHECK: Arch: aarch64 ; CHECK: AddressSize: 64bit @@ -83,9 +74,6 @@ ; CHECK: 0x40 IMAGE_REL_ARM64_SECREL_LOW12A foo ; CHECK: 0x44 IMAGE_REL_ARM64_SECREL_HIGH12A foo ; CHECK: 0x48 IMAGE_REL_ARM64_SECREL_LOW12L foo -; CHECK: 0x4C IMAGE_REL_ARM64_REL21 foo -; CHECK: 0x50 IMAGE_REL_ARM64_BRANCH19 target -; CHECK: 0x54 IMAGE_REL_ARM64_BRANCH14 target ; CHECK: } ; CHECK: ] @@ -96,4 +84,3 @@ ; DISASM: 40: 00 00 00 91 add x0, x0, #0 ; DISASM: 44: 00 00 40 91 add x0, x0, #0, lsl #12 ; DISASM: 48: 00 00 40 f9 ldr x0, [x0] -; DISASM: 4c: 20 1a 09 30 adr x0, #74565 Index: test/MC/WebAssembly/blockaddress.ll =================================================================== --- test/MC/WebAssembly/blockaddress.ll +++ test/MC/WebAssembly/blockaddress.ll @@ -1,6 +1,6 @@ ; TODO(sbc): Make this test pass by adding support for unnamed tempoaries ; in wasm relocations. 
-; RUN: not llc -filetype=obj %s -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc -filetype=obj %s target triple = "wasm32-unknown-unknown-wasm" @@ -13,5 +13,3 @@ addr: ret i32 0 } - -; CHECK: LLVM ERROR: relocations for function or section offsets are only supported in metadata sections Index: test/MC/WebAssembly/debug-info.ll =================================================================== --- test/MC/WebAssembly/debug-info.ll +++ test/MC/WebAssembly/debug-info.ll @@ -96,38 +96,38 @@ ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) -; CHECK-NEXT: Size: 88 +; CHECK-NEXT: Size: 100 ; CHECK-NEXT: Offset: 733 ; CHECK-NEXT: Name: linking ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 9 -; CHECK-NEXT: Offset: 835 +; CHECK-NEXT: Offset: 847 ; CHECK-NEXT: Name: reloc.DATA ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 58 -; CHECK-NEXT: Offset: 861 +; CHECK-NEXT: Offset: 873 ; CHECK-NEXT: Name: reloc..debug_info ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 6 -; CHECK-NEXT: Offset: 943 +; CHECK-NEXT: Offset: 955 ; CHECK-NEXT: Name: reloc..debug_pubnames ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 6 -; CHECK-NEXT: Offset: 977 +; CHECK-NEXT: Offset: 989 ; CHECK-NEXT: Name: reloc..debug_pubtypes ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 6 -; CHECK-NEXT: Offset: 1011 +; CHECK-NEXT: Offset: 1023 ; CHECK-NEXT: Name: reloc..debug_line ; CHECK-NEXT: } ; CHECK-NEXT:] @@ -163,37 +163,37 @@ ; CHECK-NEXT:] ; CHECK-NEXT:Symbols [ ; CHECK-NEXT: Symbol { -; CHECK-NEXT: Name: f2 -; CHECK-NEXT: Type: FUNCTION (0x0) -; CHECK-NEXT: Flags: 0x4 +; CHECK-NEXT: Name: .debug_str +; CHECK-NEXT: Type: SECTION (0x3) +; CHECK-NEXT: Flags: 0x2 ; CHECK-NEXT: } ; CHECK-NEXT: Symbol { -; CHECK-NEXT: Name: foo -; CHECK-NEXT: Type: DATA (0x1) -; CHECK-NEXT: Flags: 0x4 +; CHECK-NEXT: Name: .debug_abbrev +; CHECK-NEXT: Type: SECTION (0x3) +; CHECK-NEXT: Flags: 0x2 ; CHECK-NEXT: } ; CHECK-NEXT: Symbol { -; CHECK-NEXT: Name: myextern -; CHECK-NEXT: Type: DATA (0x1) -; CHECK-NEXT: Flags: 0x10 +; CHECK-NEXT: Name: .debug_info +; CHECK-NEXT: Type: SECTION (0x3) +; CHECK-NEXT: Flags: 0x2 ; CHECK-NEXT: } ; CHECK-NEXT: Symbol { -; CHECK-NEXT: Name: ptr2 -; CHECK-NEXT: Type: DATA (0x1) -; CHECK-NEXT: Flags: 0x4 +; CHECK-NEXT: Name: .debug_ranges +; CHECK-NEXT: Type: SECTION (0x3) +; CHECK-NEXT: Flags: 0x2 ; CHECK-NEXT: } ; CHECK-NEXT: Symbol { -; CHECK-NEXT: Name: .debug_str +; CHECK-NEXT: Name: .debug_macinfo ; CHECK-NEXT: Type: SECTION (0x3) ; CHECK-NEXT: Flags: 0x2 ; CHECK-NEXT: } ; CHECK-NEXT: Symbol { -; CHECK-NEXT: Name: .debug_abbrev +; CHECK-NEXT: Name: .debug_pubnames ; CHECK-NEXT: Type: SECTION (0x3) ; CHECK-NEXT: Flags: 0x2 ; CHECK-NEXT: } ; CHECK-NEXT: Symbol { -; CHECK-NEXT: Name: .debug_info +; CHECK-NEXT: Name: .debug_pubtypes ; CHECK-NEXT: Type: SECTION (0x3) ; CHECK-NEXT: Flags: 0x2 ; CHECK-NEXT: } @@ -202,6 +202,26 @@ ; CHECK-NEXT: Type: SECTION (0x3) ; CHECK-NEXT: Flags: 0x2 ; CHECK-NEXT: } +; CHECK-NEXT: Symbol { +; CHECK-NEXT: Name: f2 +; CHECK-NEXT: Type: FUNCTION (0x0) +; CHECK-NEXT: Flags: 0x4 +; CHECK-NEXT: } +; CHECK-NEXT: Symbol { +; CHECK-NEXT: Name: foo +; CHECK-NEXT: Type: DATA (0x1) +; CHECK-NEXT: Flags: 0x4 +; CHECK-NEXT: } +; CHECK-NEXT: Symbol { +; CHECK-NEXT: Name: myextern +; CHECK-NEXT: Type: DATA (0x1) +; CHECK-NEXT: Flags: 0x10 +; 
CHECK-NEXT: } +; CHECK-NEXT: Symbol { +; CHECK-NEXT: Name: ptr2 +; CHECK-NEXT: Type: DATA (0x1) +; CHECK-NEXT: Flags: 0x4 +; CHECK-NEXT: } ; CHECK-NEXT:] target triple = "wasm32-unknown-unknown-wasm"
Index: test/Other/Inputs/invariant.group.barrier.ll
===================================================================
--- /dev/null
+++ test/Other/Inputs/invariant.group.barrier.ll
@@ -0,0 +1,16 @@
+; RUN: opt -S -gvn < %s | FileCheck %s
+; RUN: opt -S -newgvn < %s | FileCheck %s
+; RUN: opt -S -O3 < %s | FileCheck %s
+
+; This test checks that the optimizer does not prove pointer equality based on mustalias
+; CHECK-LABEL: define void @dontProveEquality(i8* %a)
+define void @dontProveEquality(i8* %a) {
+  %b = call i8* @llvm.invariant.group.barrier(i8* %a)
+  %r = icmp eq i8* %b, %a
+; CHECK: call void @use(i1 %r)
+  call void @use(i1 %r)
+  ret void
+}
+
+declare void @use(i1)
+declare i8* @llvm.invariant.group.barrier(i8*)
Index: test/Other/invariant.group.barrier.ll
===================================================================
--- /dev/null
+++ test/Other/invariant.group.barrier.ll
@@ -0,0 +1,83 @@
+; RUN: opt -S -early-cse < %s | FileCheck %s
+; RUN: opt -S -gvn < %s | FileCheck %s
+; RUN: opt -S -newgvn < %s | FileCheck %s
+; RUN: opt -S -O3 < %s | FileCheck %s
+
+; These tests check whether passes with CSE functionality can CSE calls to
+; invariant.group.barrier, which is prohibited if there is a memory clobber
+; between the barrier calls.
+
+; CHECK-LABEL: define i8 @optimizable()
+define i8 @optimizable() {
+entry:
+  %ptr = alloca i8
+  store i8 42, i8* %ptr, !invariant.group !0
+; CHECK: call i8* @llvm.invariant.group.barrier.p0i8
+  %ptr2 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %ptr)
+; FIXME: This one could be CSE'd.
+; CHECK: call i8* @llvm.invariant.group.barrier
+  %ptr3 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %ptr)
+; CHECK: call void @clobber(i8* {{.*}}%ptr)
+  call void @clobber(i8* %ptr)
+
+; CHECK: call void @use(i8* {{.*}}%ptr2)
+  call void @use(i8* %ptr2)
+; CHECK: call void @use(i8* {{.*}}%ptr3)
+  call void @use(i8* %ptr3)
+; CHECK: load i8, i8* %ptr3, {{.*}}!invariant.group
+  %v = load i8, i8* %ptr3, !invariant.group !0
+
+  ret i8 %v
+}
+
+; CHECK-LABEL: define i8 @unoptimizable()
+define i8 @unoptimizable() {
+entry:
+  %ptr = alloca i8
+  store i8 42, i8* %ptr, !invariant.group !0
+; CHECK: call i8* @llvm.invariant.group.barrier.p0i8
+  %ptr2 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %ptr)
+  call void @clobber(i8* %ptr)
+; CHECK: call i8* @llvm.invariant.group.barrier.p0i8
+  %ptr3 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %ptr)
+; CHECK: call void @clobber(i8* {{.*}}%ptr)
+  call void @clobber(i8* %ptr)
+; CHECK: call void @use(i8* {{.*}}%ptr2)
+  call void @use(i8* %ptr2)
+; CHECK: call void @use(i8* {{.*}}%ptr3)
+  call void @use(i8* %ptr3)
+; CHECK: load i8, i8* %ptr3, {{.*}}!invariant.group
+  %v = load i8, i8* %ptr3, !invariant.group !0
+
+  ret i8 %v
+}
+
+; CHECK-LABEL: define i8 @unoptimizable2()
+define i8 @unoptimizable2() {
+  %ptr = alloca i8
+  store i8 42, i8* %ptr, !invariant.group !0
+; CHECK: call i8* @llvm.invariant.group.barrier
+  %ptr2 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %ptr)
+  store i8 43, i8* %ptr
+; CHECK: call i8* @llvm.invariant.group.barrier
+  %ptr3 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %ptr)
+; CHECK: call void @clobber(i8* {{.*}}%ptr)
+  call void @clobber(i8* %ptr)
+; CHECK: call void @use(i8* {{.*}}%ptr2)
+  call void @use(i8* %ptr2)
+; CHECK: call void @use(i8* {{.*}}%ptr3)
+  call void @use(i8* %ptr3)
+; CHECK: load i8, i8*
%ptr3, {{.*}}!invariant.group + %v = load i8, i8* %ptr3, !invariant.group !0 + ret i8 %v +} + +declare void @use(i8* readonly) + +declare void @clobber(i8*) +; CHECK: Function Attrs: inaccessiblememonly nounwind{{$}} +; CHECK-NEXT: declare i8* @llvm.invariant.group.barrier.p0i8(i8*) +declare i8* @llvm.invariant.group.barrier.p0i8(i8*) + +!0 = !{} + Index: test/TableGen/GlobalISelEmitter.td =================================================================== --- test/TableGen/GlobalISelEmitter.td +++ test/TableGen/GlobalISelEmitter.td @@ -78,7 +78,6 @@ // CHECK-NEXT: bool testImmPredicate_I64(unsigned PredicateID, int64_t Imm) const override; // CHECK-NEXT: bool testImmPredicate_APInt(unsigned PredicateID, const APInt &Imm) const override; // CHECK-NEXT: bool testImmPredicate_APFloat(unsigned PredicateID, const APFloat &Imm) const override; -// CHECK-NEXT: const int64_t *getMatchTable() const override; // CHECK-NEXT: #endif // ifdef GET_GLOBALISEL_TEMPORARIES_DECL // CHECK-LABEL: #ifdef GET_GLOBALISEL_TEMPORARIES_INIT @@ -222,16 +221,10 @@ // CHECK-NEXT: State.MIs.clear(); // CHECK-NEXT: State.MIs.push_back(&I); -// CHECK: if (executeMatchTable(*this, OutMIs, State, ISelInfo, getMatchTable(), TII, MRI, TRI, RBI, AvailableFeatures, CoverageInfo)) { -// CHECK-NEXT: return true; -// CHECK-NEXT: } - //===- Test a pattern with multiple ComplexPatterns in multiple instrs ----===// // -// CHECK: const int64_t * -// CHECK-LABEL: MyTargetInstructionSelector::getMatchTable() const { -// CHECK: MatchTable0[] = { +// CHECK-LABEL: MatchTable0[] = { // OPT-NEXT: GIM_Try, /*On fail goto*//*Label [[GRP_LABEL_NUM:[0-9]+]]*/ [[GRP_LABEL:[0-9]+]], // OPT-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_SELECT, // CHECK-NEXT: GIM_Try, /*On fail goto*//*Label [[LABEL_NUM:[0-9]+]]*/ [[LABEL:[0-9]+]], @@ -1107,4 +1100,6 @@ // CHECK-NEXT: GIM_Reject, // CHECK-NEXT: }; -// CHECK-NEXT: return MatchTable0; +// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, ISelInfo, MatchTable0, TII, MRI, TRI, RBI, AvailableFeatures, CoverageInfo)) { +// CHECK-NEXT: return true; +// CHECK-NEXT: } Index: test/Transforms/CodeGenPrepare/invariant.group.ll =================================================================== --- test/Transforms/CodeGenPrepare/invariant.group.ll +++ test/Transforms/CodeGenPrepare/invariant.group.ll @@ -6,10 +6,10 @@ define void @foo() { enter: ; CHECK-NOT: !invariant.group - ; CHECK-NOT: @llvm.launder.invariant.group.p0i8( + ; CHECK-NOT: @llvm.invariant.group.barrier.p0i8( ; CHECK: %val = load i8, i8* @tmp, !tbaa %val = load i8, i8* @tmp, !invariant.group !0, !tbaa !{!1, !1, i64 0} - %ptr = call i8* @llvm.launder.invariant.group.p0i8(i8* @tmp) + %ptr = call i8* @llvm.invariant.group.barrier.p0i8(i8* @tmp) ; CHECK: store i8 42, i8* @tmp store i8 42, i8* %ptr, !invariant.group !0 @@ -18,7 +18,7 @@ } ; CHECK-LABEL: } -declare i8* @llvm.launder.invariant.group.p0i8(i8*) +declare i8* @llvm.invariant.group.barrier.p0i8(i8*) !0 = !{!"something"} !1 = !{!"x", !0} Index: test/Transforms/GCOVProfiling/function-numbering.ll =================================================================== --- test/Transforms/GCOVProfiling/function-numbering.ll +++ test/Transforms/GCOVProfiling/function-numbering.ll @@ -19,72 +19,9 @@ ; GCDA: @[[FOO:[0-9]+]] = private unnamed_addr constant [4 x i8] c"foo\00" ; GCDA-NOT: @{{[0-9]+}} = private unnamed_addr constant .* c"bar\00" ; GCDA: @[[BAZ:[0-9]+]] = private unnamed_addr constant [4 x i8] c"baz\00" -; GCDA: @__llvm_internal_gcov_emit_function_args.0 = internal unnamed_addr 
constant -; GCDA-SAME: { i32 0, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @[[FOO]] -; GCDA-SAME: { i32 1, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @[[BAZ]] -; -; GCDA-LABEL: define internal void @__llvm_gcov_writeout() {{.*}} { -; GCDA-NEXT: entry: -; GCDA-NEXT: br label %[[FILE_LOOP_HEADER:.*]] -; -; GCDA: [[FILE_LOOP_HEADER]]: -; GCDA-NEXT: %[[IV:.*]] = phi i32 [ 0, %entry ], [ %[[NEXT_IV:.*]], %[[FILE_LOOP_LATCH:.*]] ] -; GCDA-NEXT: %[[FILE_INFO:.*]] = getelementptr inbounds {{.*}}, {{.*}}* @__llvm_internal_gcov_emit_file_info, i32 0, i32 %[[IV]] -; GCDA-NEXT: %[[START_FILE_ARGS:.*]] = getelementptr inbounds {{.*}}, {{.*}}* %[[FILE_INFO]], i32 0, i32 0 -; GCDA-NEXT: %[[START_FILE_ARG_0_PTR:.*]] = getelementptr inbounds {{.*}}, {{.*}}* %[[START_FILE_ARGS]], i32 0, i32 0 -; GCDA-NEXT: %[[START_FILE_ARG_0:.*]] = load i8*, i8** %[[START_FILE_ARG_0_PTR]] -; GCDA-NEXT: %[[START_FILE_ARG_1_PTR:.*]] = getelementptr inbounds {{.*}}, {{.*}}* %[[START_FILE_ARGS]], i32 0, i32 1 -; GCDA-NEXT: %[[START_FILE_ARG_1:.*]] = load i8*, i8** %[[START_FILE_ARG_1_PTR]] -; GCDA-NEXT: %[[START_FILE_ARG_2_PTR:.*]] = getelementptr inbounds {{.*}}, {{.*}}* %[[START_FILE_ARGS]], i32 0, i32 2 -; GCDA-NEXT: %[[START_FILE_ARG_2:.*]] = load i32, i32* %[[START_FILE_ARG_2_PTR]] -; GCDA-NEXT: call void @llvm_gcda_start_file(i8* %[[START_FILE_ARG_0]], i8* %[[START_FILE_ARG_1]], i32 %[[START_FILE_ARG_2]]) -; GCDA-NEXT: %[[NUM_COUNTERS_PTR:.*]] = getelementptr inbounds {{.*}}, {{.*}}* %[[FILE_INFO]], i32 0, i32 1 -; GCDA-NEXT: %[[NUM_COUNTERS:.*]] = load i32, i32* %[[NUM_COUNTERS_PTR]] -; GCDA-NEXT: %[[EMIT_FUN_ARGS_ARRAY_PTR:.*]] = getelementptr inbounds {{.*}}, {{.*}}* %[[FILE_INFO]], i32 0, i32 2 -; GCDA-NEXT: %[[EMIT_FUN_ARGS_ARRAY:.*]] = load {{.*}}*, {{.*}}** %[[EMIT_FUN_ARGS_ARRAY_PTR]] -; GCDA-NEXT: %[[EMIT_ARCS_ARGS_ARRAY_PTR:.*]] = getelementptr inbounds {{.*}}, {{.*}}* %[[FILE_INFO]], i32 0, i32 3 -; GCDA-NEXT: %[[EMIT_ARCS_ARGS_ARRAY:.*]] = load {{.*}}*, {{.*}}** %[[EMIT_ARCS_ARGS_ARRAY_PTR]] -; GCDA-NEXT: %[[ENTER_COUNTER_LOOP_COND:.*]] = icmp slt i32 0, %[[NUM_COUNTERS]] -; GCDA-NEXT: br i1 %[[ENTER_COUNTER_LOOP_COND]], label %[[COUNTER_LOOP:.*]], label %[[FILE_LOOP_LATCH]] -; -; GCDA: [[COUNTER_LOOP]]: -; GCDA-NEXT: %[[JV:.*]] = phi i32 [ 0, %[[FILE_LOOP_HEADER]] ], [ %[[NEXT_JV:.*]], %[[COUNTER_LOOP]] ] -; GCDA-NEXT: %[[EMIT_FUN_ARGS:.*]] = getelementptr inbounds {{.*}}, {{.*}}* %[[EMIT_FUN_ARGS_ARRAY]], i32 %[[JV]] -; GCDA-NEXT: %[[EMIT_FUN_ARG_0_PTR:.*]] = getelementptr inbounds {{.*}}, {{.*}}* %[[EMIT_FUN_ARGS]], i32 0, i32 0 -; GCDA-NEXT: %[[EMIT_FUN_ARG_0:.*]] = load i32, i32* %[[EMIT_FUN_ARG_0_PTR]] -; GCDA-NEXT: %[[EMIT_FUN_ARG_1_PTR:.*]] = getelementptr inbounds {{.*}}, {{.*}}* %[[EMIT_FUN_ARGS]], i32 0, i32 1 -; GCDA-NEXT: %[[EMIT_FUN_ARG_1:.*]] = load i8*, i8** %[[EMIT_FUN_ARG_1_PTR]] -; GCDA-NEXT: %[[EMIT_FUN_ARG_2_PTR:.*]] = getelementptr inbounds {{.*}}, {{.*}}* %[[EMIT_FUN_ARGS]], i32 0, i32 2 -; GCDA-NEXT: %[[EMIT_FUN_ARG_2:.*]] = load i32, i32* %[[EMIT_FUN_ARG_2_PTR]] -; GCDA-NEXT: %[[EMIT_FUN_ARG_3_PTR:.*]] = getelementptr inbounds {{.*}}, {{.*}}* %[[EMIT_FUN_ARGS]], i32 0, i32 3 -; GCDA-NEXT: %[[EMIT_FUN_ARG_3:.*]] = load i8, i8* %[[EMIT_FUN_ARG_3_PTR]] -; GCDA-NEXT: %[[EMIT_FUN_ARG_4_PTR:.*]] = getelementptr inbounds {{.*}}, {{.*}}* %[[EMIT_FUN_ARGS]], i32 0, i32 4 -; GCDA-NEXT: %[[EMIT_FUN_ARG_4:.*]] = load i32, i32* %[[EMIT_FUN_ARG_4_PTR]] -; GCDA-NEXT: call void @llvm_gcda_emit_function(i32 %[[EMIT_FUN_ARG_0]], -; GCDA-SAME: i8* %[[EMIT_FUN_ARG_1]], -; GCDA-SAME: i32 
%[[EMIT_FUN_ARG_2]], -; GCDA-SAME: i8 %[[EMIT_FUN_ARG_3]], -; GCDA-SAME: i32 %[[EMIT_FUN_ARG_4]]) -; GCDA-NEXT: %[[EMIT_ARCS_ARGS:.*]] = getelementptr inbounds {{.*}}, {{.*}}* %[[EMIT_ARCS_ARGS_ARRAY]], i32 %[[JV]] -; GCDA-NEXT: %[[EMIT_ARCS_ARG_0_PTR:.*]] = getelementptr inbounds {{.*}}, {{.*}}* %[[EMIT_ARCS_ARGS]], i32 0, i32 0 -; GCDA-NEXT: %[[EMIT_ARCS_ARG_0:.*]] = load i32, i32* %[[EMIT_ARCS_ARG_0_PTR]] -; GCDA-NEXT: %[[EMIT_ARCS_ARG_1_PTR:.*]] = getelementptr inbounds {{.*}}, {{.*}}* %[[EMIT_ARCS_ARGS]], i32 0, i32 1 -; GCDA-NEXT: %[[EMIT_ARCS_ARG_1:.*]] = load i64*, i64** %[[EMIT_ARCS_ARG_1_PTR]] -; GCDA-NEXT: call void @llvm_gcda_emit_arcs(i32 %[[EMIT_ARCS_ARG_0]], -; GCDA-SAME: i64* %[[EMIT_ARCS_ARG_1]]) -; GCDA-NEXT: %[[NEXT_JV]] = add i32 %[[JV]], 1 -; GCDA-NEXT: %[[COUNTER_LOOP_COND:.*]] = icmp slt i32 %[[NEXT_JV]], %[[NUM_COUNTERS]] -; GCDA-NEXT: br i1 %[[COUNTER_LOOP_COND]], label %[[COUNTER_LOOP]], label %[[FILE_LOOP_LATCH]] -; -; GCDA: [[FILE_LOOP_LATCH]]: -; GCDA-NEXT: call void @llvm_gcda_summary_info() -; GCDA-NEXT: call void @llvm_gcda_end_file() -; GCDA-NEXT: %[[NEXT_IV]] = add i32 %[[IV]], 1 -; GCDA-NEXT: %[[FILE_LOOP_COND:.*]] = icmp slt i32 %[[NEXT_IV]], 1 -; GCDA-NEXT: br i1 %[[FILE_LOOP_COND]], label %[[FILE_LOOP_HEADER]], label %[[EXIT:.*]] -; -; GCDA: [[EXIT]]: -; GCDA-NEXT: ret void +; GCDA: define internal void @__llvm_gcov_writeout() +; GCDA: call void @llvm_gcda_emit_function(i32 0, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @[[FOO]] +; GCDA: call void @llvm_gcda_emit_function(i32 1, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @[[BAZ]] ; GCNO: == foo (0) @ ; GCNO-NOT: == bar ({{[0-9]+}}) @ Index: test/Transforms/GVN/invariant.group.ll =================================================================== --- test/Transforms/GVN/invariant.group.ll +++ test/Transforms/GVN/invariant.group.ll @@ -25,7 +25,7 @@ entry: %ptr = alloca i8 store i8 42, i8* %ptr, !invariant.group !0 - %ptr2 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr) + %ptr2 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %ptr) %a = load i8, i8* %ptr, !invariant.group !0 call void @foo(i8* %ptr2); call to use %ptr2 @@ -242,7 +242,7 @@ entry: %ptr = alloca i8 store i8 42, i8* %ptr, !invariant.group !0 - %ptr2 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr) + %ptr2 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %ptr) ; CHECK-NOT: load %a = load i8, i8* %ptr2, !invariant.group !0 @@ -314,7 +314,7 @@ ; CHECK: store i8 %unknownValue, i8* %ptr, !invariant.group !0 store i8 %unknownValue, i8* %ptr, !invariant.group !0 - %newPtr2 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr) + %newPtr2 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %ptr) ; CHECK-NOT: load %d = load i8, i8* %newPtr2, !invariant.group !0 ; CHECK: ret i8 %unknownValue @@ -441,7 +441,7 @@ declare void @_ZN1AC1Ev(%struct.A*) declare void @fooBit(i1*, i1) -declare i8* @llvm.launder.invariant.group.p0i8(i8*) +declare i8* @llvm.invariant.group.barrier.p0i8(i8*) ; Function Attrs: nounwind declare void @llvm.assume(i1 %cmp.vtables) #0 Index: test/Transforms/GlobalOpt/invariant.group.barrier.ll =================================================================== --- test/Transforms/GlobalOpt/invariant.group.barrier.ll +++ test/Transforms/GlobalOpt/invariant.group.barrier.ll @@ -33,7 +33,7 @@ store i32 %val, i32* %valptr %0 = bitcast i32* %valptr to i8* - %barr = call i8* @llvm.launder.invariant.group(i8* %0) + %barr = call i8* @llvm.invariant.group.barrier(i8* %0) %1 = bitcast i8* %barr to i32* %val2 = load i32, 
i32* %1 @@ -41,7 +41,7 @@ ret void } -; We can't step through launder.invariant.group here, because that would change +; We can't step through invariant.group.barrier here, because that would change ; this load in @usage_of_globals() ; val = load i32, i32* %ptrVal, !invariant.group !0 ; into @@ -54,7 +54,7 @@ store i32 13, i32* @tmp3, !invariant.group !0 %0 = bitcast i32* @tmp3 to i8* - %barr = call i8* @llvm.launder.invariant.group(i8* %0) + %barr = call i8* @llvm.invariant.group.barrier(i8* %0) %1 = bitcast i8* %barr to i32* store i32* %1, i32** @ptrToTmp3 @@ -74,6 +74,6 @@ declare void @changeTmp3ValAndCallBarrierInside() -declare i8* @llvm.launder.invariant.group(i8*) +declare i8* @llvm.invariant.group.barrier(i8*) !0 = !{!"something"} Index: test/Transforms/InstCombine/rem.ll =================================================================== --- test/Transforms/InstCombine/rem.ll +++ test/Transforms/InstCombine/rem.ll @@ -354,11 +354,12 @@ define i32 @test18(i16 %x, i32 %y) { ; CHECK-LABEL: @test18( -; CHECK-NEXT: [[TMP1:%.*]] = and i16 [[X:%.*]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 63, i32 31 -; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], [[Y:%.*]] -; CHECK-NEXT: ret i32 [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i16 [[X:%.*]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 32 +; CHECK-NEXT: [[TMP3:%.*]] = xor i16 [[TMP2]], 63 +; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[TMP5]] ; %1 = and i16 %x, 4 %2 = icmp ne i16 %1, 0 Index: test/Transforms/InstCombine/select-icmp-and.ll =================================================================== --- test/Transforms/InstCombine/select-icmp-and.ll +++ test/Transforms/InstCombine/select-icmp-and.ll @@ -29,9 +29,10 @@ define i32 @test35(i32 %x) { ; CHECK-LABEL: @test35( -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], -1 -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 60, i32 100 -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 40 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 60 +; CHECK-NEXT: ret i32 [[TMP3]] ; %cmp = icmp sge i32 %x, 0 %cond = select i1 %cmp, i32 60, i32 100 @@ -40,9 +41,10 @@ define <2 x i32> @test35vec(<2 x i32> %x) { ; CHECK-LABEL: @test35vec( -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[COND:%.*]] = select <2 x i1> [[CMP]], <2 x i32> , <2 x i32> -; CHECK-NEXT: ret <2 x i32> [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw <2 x i32> [[TMP2]], +; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %cmp = icmp sge <2 x i32> %x, %cond = select <2 x i1> %cmp, <2 x i32> , <2 x i32> @@ -53,9 +55,10 @@ define i32 @test35_with_trunc(i64 %x) { ; CHECK-LABEL: @test35_with_trunc( ; CHECK-NEXT: [[X1:%.*]] = trunc i64 [[X:%.*]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X1]], -1 -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 60, i32 100 -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X1]], 31 +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 40 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 60 +; CHECK-NEXT: ret i32 [[TMP3]] ; %x1 = trunc i64 %x to i32 %cmp = icmp sge i32 %x1, 0 @@ -65,9 +68,10 @@ define i32 @test36(i32 %x) { ; CHECK-LABEL: @test36( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 
[[X:%.*]], 0 -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 60, i32 100 -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], -40 +; CHECK-NEXT: [[TMP3:%.*]] = add nsw i32 [[TMP2]], 100 +; CHECK-NEXT: ret i32 [[TMP3]] ; %cmp = icmp slt i32 %x, 0 %cond = select i1 %cmp, i32 60, i32 100 @@ -76,9 +80,10 @@ define <2 x i32> @test36vec(<2 x i32> %x) { ; CHECK-LABEL: @test36vec( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i32> [[X:%.*]], zeroinitializer -; CHECK-NEXT: [[COND:%.*]] = select <2 x i1> [[CMP]], <2 x i32> , <2 x i32> -; CHECK-NEXT: ret <2 x i32> [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], +; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %cmp = icmp slt <2 x i32> %x, %cond = select <2 x i1> %cmp, <2 x i32> , <2 x i32> @@ -87,9 +92,9 @@ define i32 @test37(i32 %x) { ; CHECK-LABEL: @test37( -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], -1 -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 1, i32 -1 -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 1 +; CHECK-NEXT: ret i32 [[TMP2]] ; %cmp = icmp sgt i32 %x, -1 %cond = select i1 %cmp, i32 1, i32 -1 @@ -98,9 +103,9 @@ define <2 x i32> @test37vec(<2 x i32> %x) { ; CHECK-LABEL: @test37vec( -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[COND:%.*]] = select <2 x i1> [[CMP]], <2 x i32> , <2 x i32> -; CHECK-NEXT: ret <2 x i32> [[COND]] +; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[TMP1]], +; CHECK-NEXT: ret <2 x i32> [[TMP2]] ; %cmp = icmp sgt <2 x i32> %x, %cond = select <2 x i1> %cmp, <2 x i32> , <2 x i32> @@ -109,10 +114,11 @@ define i32 @test65(i64 %x) { ; CHECK-LABEL: @test65( -; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[X:%.*]], 16 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 42, i32 40 -; CHECK-NEXT: ret i32 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[X:%.*]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP3]], 42 +; CHECK-NEXT: ret i32 [[TMP4]] ; %1 = and i64 %x, 16 %2 = icmp ne i64 %1, 0 @@ -122,10 +128,11 @@ define <2 x i32> @test65vec(<2 x i64> %x) { ; CHECK-LABEL: @test65vec( -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> , <2 x i32> -; CHECK-NEXT: ret <2 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i64> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i32> [[TMP3]], +; CHECK-NEXT: ret <2 x i32> [[TMP4]] ; %1 = and <2 x i64> %x, %2 = icmp ne <2 x i64> %1, zeroinitializer @@ -135,10 +142,11 @@ define i32 @test66(i64 %x) { ; CHECK-LABEL: @test66( -; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[X:%.*]], 4294967296 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 42, i32 40 -; CHECK-NEXT: ret i32 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[X:%.*]], 31 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 2 +; 
CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP3]], 42 +; CHECK-NEXT: ret i32 [[TMP4]] ; %1 = and i64 %x, 4294967296 %2 = icmp ne i64 %1, 0 @@ -148,10 +156,11 @@ define <2 x i32> @test66vec(<2 x i64> %x) { ; CHECK-LABEL: @test66vec( -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> , <2 x i32> -; CHECK-NEXT: ret <2 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i64> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i32> [[TMP3]], +; CHECK-NEXT: ret <2 x i32> [[TMP4]] ; %1 = and <2 x i64> %x, %2 = icmp ne <2 x i64> %1, zeroinitializer @@ -175,10 +184,11 @@ define i32 @test67(i16 %x) { ; CHECK-LABEL: @test67( -; CHECK-NEXT: [[TMP1:%.*]] = and i16 [[X:%.*]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 42, i32 40 -; CHECK-NEXT: ret i32 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = lshr i16 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = xor i16 [[TMP2]], 42 +; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32 +; CHECK-NEXT: ret i32 [[TMP4]] ; %1 = and i16 %x, 4 %2 = icmp ne i16 %1, 0 @@ -188,10 +198,11 @@ define <2 x i32> @test67vec(<2 x i16> %x) { ; CHECK-LABEL: @test67vec( -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i16> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i16> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> , <2 x i32> -; CHECK-NEXT: ret <2 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i16> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i16> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i16> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[TMP4]] ; %1 = and <2 x i16> %x, %2 = icmp ne <2 x i16> %1, zeroinitializer @@ -201,9 +212,9 @@ define i32 @test71(i32 %x) { ; CHECK-LABEL: @test71( -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], 128 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 42, i32 40 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 6 +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 42 ; CHECK-NEXT: ret i32 [[TMP3]] ; %1 = and i32 %x, 128 @@ -214,9 +225,9 @@ define <2 x i32> @test71vec(<2 x i32> %x) { ; CHECK-LABEL: @test71vec( -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> , <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[TMP2]], ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %1 = and <2 x i32> %x, @@ -227,9 +238,9 @@ define i32 @test72(i32 %x) { ; CHECK-LABEL: @test72( -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], 128 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 40, i32 42 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 6 +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], 40 ; CHECK-NEXT: ret i32 [[TMP3]] ; %1 = and i32 %x, 128 @@ -240,9 +251,9 @@ define <2 x i32> @test72vec(<2 x i32> %x) { ; CHECK-LABEL: 
@test72vec( -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> , <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i32> [[TMP2]], ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %1 = and <2 x i32> %x, @@ -253,9 +264,9 @@ define i32 @test73(i32 %x) { ; CHECK-LABEL: @test73( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i8 -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i8 [[TMP1]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 40, i32 42 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 6 +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], 40 ; CHECK-NEXT: ret i32 [[TMP3]] ; %1 = trunc i32 %x to i8 @@ -266,9 +277,9 @@ define <2 x i32> @test73vec(<2 x i32> %x) { ; CHECK-LABEL: @test73vec( -; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i8> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> , <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i32> [[TMP2]], ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %1 = trunc <2 x i32> %x to <2 x i8> @@ -279,9 +290,10 @@ define i32 @test74(i32 %x) { ; CHECK-LABEL: @test74( -; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[X:%.*]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 40, i32 42 -; CHECK-NEXT: ret i32 [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], 40 +; CHECK-NEXT: ret i32 [[TMP3]] ; %1 = icmp sgt i32 %x, -1 %2 = select i1 %1, i32 40, i32 42 @@ -290,9 +302,10 @@ define <2 x i32> @test74vec(<2 x i32> %x) { ; CHECK-LABEL: @test74vec( -; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> , <2 x i32> -; CHECK-NEXT: ret <2 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i32> [[TMP2]], +; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %1 = icmp sgt <2 x i32> %x, %2 = select <2 x i1> %1, <2 x i32> , <2 x i32> @@ -390,9 +403,9 @@ ;; (a & 8) ? -9 : -1 define i32 @test15h(i32 %X) { ; CHECK-LABEL: @test15h( -; CHECK-NEXT: [[T1:%.*]] = and i32 [[X:%.*]], 8 -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[T1]], -1 -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[X:%.*]], -9 +; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 8 +; CHECK-NEXT: ret i32 [[TMP2]] ; %t1 = and i32 %X, 8 %t2 = icmp ne i32 %t1, 0 @@ -403,10 +416,11 @@ ;; (a & 2) ? 577 : 1089 define i32 @test15i(i32 %X) { ; CHECK-LABEL: @test15i( -; CHECK-NEXT: [[T1:%.*]] = and i32 [[X:%.*]], 2 -; CHECK-NEXT: [[T2:%.*]] = icmp eq i32 [[T1]], 0 -; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2]], i32 1089, i32 577 -; CHECK-NEXT: ret i32 [[T3]] +; CHECK-NEXT: [[T1:%.*]] = shl i32 [[X:%.*]], 8 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[T1]], 512 +; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 512 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 577 +; CHECK-NEXT: ret i32 [[TMP3]] ; %t1 = and i32 %X, 2 %t2 = icmp ne i32 %t1, 0 @@ -417,10 +431,10 @@ ;; (a & 2) ? 
1089 : 577 define i32 @test15j(i32 %X) { ; CHECK-LABEL: @test15j( -; CHECK-NEXT: [[T1:%.*]] = and i32 [[X:%.*]], 2 -; CHECK-NEXT: [[T2:%.*]] = icmp eq i32 [[T1]], 0 -; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2]], i32 577, i32 1089 -; CHECK-NEXT: ret i32 [[T3]] +; CHECK-NEXT: [[T1:%.*]] = shl i32 [[X:%.*]], 8 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[T1]], 512 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 577 +; CHECK-NEXT: ret i32 [[TMP2]] ; %t1 = and i32 %X, 2 %t2 = icmp ne i32 %t1, 0 @@ -507,7 +521,7 @@ define i8 @clear_to_set_decomposebittest(i8 %x) { ; CHECK-LABEL: @clear_to_set_decomposebittest( ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -128 -; CHECK-NEXT: [[TMP2:%.*]] = xor i8 [[TMP1]], -125 +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], -125 ; CHECK-NEXT: ret i8 [[TMP2]] ; %t2 = icmp sgt i8 %x, -1 @@ -546,7 +560,7 @@ define i8 @set_to_clear_decomposebittest(i8 %x) { ; CHECK-LABEL: @set_to_clear_decomposebittest( ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -128 -; CHECK-NEXT: [[TMP2:%.*]] = xor i8 [[TMP1]], -125 +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], -125 ; CHECK-NEXT: ret i8 [[TMP2]] ; %t2 = icmp slt i8 %x, 0 @@ -560,9 +574,10 @@ define i8 @clear_to_set_decomposebittest_extra_use(i8 %x) { ; CHECK-LABEL: @clear_to_set_decomposebittest_extra_use( ; CHECK-NEXT: [[T2:%.*]] = icmp sgt i8 [[X:%.*]], -1 -; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2]], i8 -125, i8 3 +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X]], -128 +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], -125 ; CHECK-NEXT: call void @use1(i1 [[T2]]) -; CHECK-NEXT: ret i8 [[T3]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %t2 = icmp sgt i8 %x, -1 %t3 = select i1 %t2, i8 131, i8 3 @@ -576,9 +591,10 @@ define i8 @clear_to_clear_decomposebittest_extra_use(i8 %x) { ; CHECK-LABEL: @clear_to_clear_decomposebittest_extra_use( ; CHECK-NEXT: [[T2:%.*]] = icmp sgt i8 [[X:%.*]], -1 -; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2]], i8 3, i8 -125 +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X]], -128 +; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[TMP1]], 3 ; CHECK-NEXT: call void @use1(i1 [[T2]]) -; CHECK-NEXT: ret i8 [[T3]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %t2 = icmp sgt i8 %x, -1 %t3 = select i1 %t2, i8 3, i8 131 @@ -592,9 +608,10 @@ define i8 @set_to_set_decomposebittest_extra_use(i8 %x) { ; CHECK-LABEL: @set_to_set_decomposebittest_extra_use( ; CHECK-NEXT: [[T2:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2]], i8 -125, i8 3 +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X]], -128 +; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[TMP1]], 3 ; CHECK-NEXT: call void @use1(i1 [[T2]]) -; CHECK-NEXT: ret i8 [[T3]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %t2 = icmp slt i8 %x, 0 %t3 = select i1 %t2, i8 131, i8 3 @@ -608,9 +625,10 @@ define i8 @set_to_clear_decomposebittest_extra_use(i8 %x) { ; CHECK-LABEL: @set_to_clear_decomposebittest_extra_use( ; CHECK-NEXT: [[T2:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2]], i8 3, i8 -125 +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X]], -128 +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], -125 ; CHECK-NEXT: call void @use1(i1 [[T2]]) -; CHECK-NEXT: ret i8 [[T3]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %t2 = icmp slt i8 %x, 0 %t3 = select i1 %t2, i8 3, i8 131 Index: test/Transforms/InstCombine/unrecognized_three-way-comparison.ll =================================================================== --- test/Transforms/InstCombine/unrecognized_three-way-comparison.ll +++ test/Transforms/InstCombine/unrecognized_three-way-comparison.ll @@ -43,10 +43,14 @@ define i32 @compare_against_zero(i32 %x) { ; 
CHECK-LABEL: @compare_against_zero( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[X:%.*]], 0 -; CHECK-NEXT: br i1 [[TMP0]], label [[CALLFOO:%.*]], label [[EXIT:%.*]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = ashr i32 [[X]], 31 +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[TMP0]], 1 +; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP1]], i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[SELECT2]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[CALLFOO:%.*]], label [[EXIT:%.*]] ; CHECK: callfoo: -; CHECK-NEXT: call void @foo(i32 1) +; CHECK-NEXT: call void @foo(i32 [[SELECT2]]) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret i32 42 @@ -269,10 +273,15 @@ define i32 @compare_against_zero_non_idiomatic_add(i32 %x) { ; CHECK-LABEL: @compare_against_zero_non_idiomatic_add( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[X:%.*]], 0 -; CHECK-NEXT: br i1 [[TMP0]], label [[CALLFOO:%.*]], label [[EXIT:%.*]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = ashr i32 [[X]], 31 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], -431 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], 425 +; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP1]], i32 0, i32 [[TMP2]] +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[SELECT2]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[CALLFOO:%.*]], label [[EXIT:%.*]] ; CHECK: callfoo: -; CHECK-NEXT: call void @foo(i32 425) +; CHECK-NEXT: call void @foo(i32 [[SELECT2]]) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret i32 42 @@ -327,10 +336,15 @@ define i32 @compare_against_zero_non_idiomatic_or(i32 %x) { ; CHECK-LABEL: @compare_against_zero_non_idiomatic_or( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[X:%.*]], 0 -; CHECK-NEXT: br i1 [[TMP0]], label [[CALLFOO:%.*]], label [[EXIT:%.*]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = ashr i32 [[X]], 31 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], -430 +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 425 +; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP1]], i32 0, i32 [[TMP2]] +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[SELECT2]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[CALLFOO:%.*]], label [[EXIT:%.*]] ; CHECK: callfoo: -; CHECK-NEXT: call void @foo(i32 425) +; CHECK-NEXT: call void @foo(i32 [[SELECT2]]) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret i32 42 @@ -388,10 +402,17 @@ define i32 @compare_against_zero_type_mismatch_idiomatic(i64 %x) { ; CHECK-LABEL: @compare_against_zero_type_mismatch_idiomatic( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[X:%.*]], 0 -; CHECK-NEXT: br i1 [[TMP0]], label [[CALLFOO:%.*]], label [[EXIT:%.*]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 [[X:%.*]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[X]], 62 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw i32 [[TMP3]], -1 +; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP1]], i32 0, i32 [[TMP4]] +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[SELECT2]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[CALLFOO:%.*]], label [[EXIT:%.*]] ; CHECK: callfoo: -; CHECK-NEXT: call void @foo(i32 1) +; CHECK-NEXT: call void @foo(i32 [[SELECT2]]) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret i32 42 @@ -416,10 +437,17 @@ define i32 
@compare_against_zero_type_mismatch_non_idiomatic_1(i64 %x) { ; CHECK-LABEL: @compare_against_zero_type_mismatch_non_idiomatic_1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[X:%.*]], 0 -; CHECK-NEXT: br i1 [[TMP0]], label [[CALLFOO:%.*]], label [[EXIT:%.*]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 [[X:%.*]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[X]], 60 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw i32 [[TMP3]], -7 +; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP1]], i32 0, i32 [[TMP4]] +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[SELECT2]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[CALLFOO:%.*]], label [[EXIT:%.*]] ; CHECK: callfoo: -; CHECK-NEXT: call void @foo(i32 1) +; CHECK-NEXT: call void @foo(i32 [[SELECT2]]) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret i32 42 Index: test/Transforms/LoopIdiom/X86/ctlz.ll =================================================================== --- test/Transforms/LoopIdiom/X86/ctlz.ll +++ test/Transforms/LoopIdiom/X86/ctlz.ll @@ -183,85 +183,3 @@ while.end: ; preds = %while.cond ret i32 %i.0 } - -; This loop contains a volatile store. If x is initially negative, -; the code will be an infinite loop because the ashr will eventually produce -; all ones and continue doing so. This prevents the loop from terminating. If -; we convert this to a countable loop using ctlz that loop will only run 32 -; times. This is different than the infinite number of times of the original. -; FIXME: Don't transform this loop. -define i32 @foo(i32 %x) { -; LZCNT-LABEL: @foo( -; LZCNT-NEXT: entry: -; LZCNT-NEXT: [[V:%.*]] = alloca i8, align 1 -; LZCNT-NEXT: [[TOBOOL4:%.*]] = icmp eq i32 [[X:%.*]], 0 -; LZCNT-NEXT: br i1 [[TOBOOL4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_LR_PH:%.*]] -; LZCNT: while.body.lr.ph: -; LZCNT-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[X]], i1 true) -; LZCNT-NEXT: [[TMP1:%.*]] = sub i32 32, [[TMP0]] -; LZCNT-NEXT: br label [[WHILE_BODY:%.*]] -; LZCNT: while.body: -; LZCNT-NEXT: [[TCPHI:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_LR_PH]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ] -; LZCNT-NEXT: [[CNT_06:%.*]] = phi i32 [ 0, [[WHILE_BODY_LR_PH]] ], [ [[INC:%.*]], [[WHILE_BODY]] ] -; LZCNT-NEXT: [[X_ADDR_05:%.*]] = phi i32 [ [[X]], [[WHILE_BODY_LR_PH]] ], [ [[SHR:%.*]], [[WHILE_BODY]] ] -; LZCNT-NEXT: [[SHR]] = ashr i32 [[X_ADDR_05]], 1 -; LZCNT-NEXT: [[INC]] = add i32 [[CNT_06]], 1 -; LZCNT-NEXT: store volatile i8 42, i8* [[V]], align 1 -; LZCNT-NEXT: [[TCDEC]] = sub nsw i32 [[TCPHI]], 1 -; LZCNT-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0 -; LZCNT-NEXT: br i1 [[TOBOOL]], label [[WHILE_COND_WHILE_END_CRIT_EDGE:%.*]], label [[WHILE_BODY]] -; LZCNT: while.cond.while.end_crit_edge: -; LZCNT-NEXT: [[SPLIT:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY]] ] -; LZCNT-NEXT: br label [[WHILE_END]] -; LZCNT: while.end: -; LZCNT-NEXT: [[CNT_0_LCSSA:%.*]] = phi i32 [ [[SPLIT]], [[WHILE_COND_WHILE_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ] -; LZCNT-NEXT: ret i32 [[CNT_0_LCSSA]] -; -; NOLZCNT-LABEL: @foo( -; NOLZCNT-NEXT: entry: -; NOLZCNT-NEXT: [[V:%.*]] = alloca i8, align 1 -; NOLZCNT-NEXT: [[TOBOOL4:%.*]] = icmp eq i32 [[X:%.*]], 0 -; NOLZCNT-NEXT: br i1 [[TOBOOL4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_LR_PH:%.*]] -; NOLZCNT: while.body.lr.ph: -; NOLZCNT-NEXT: br label [[WHILE_BODY:%.*]] -; NOLZCNT: while.body: -; NOLZCNT-NEXT: [[CNT_06:%.*]] = phi i32 [ 0, 
[[WHILE_BODY_LR_PH]] ], [ [[INC:%.*]], [[WHILE_BODY]] ] -; NOLZCNT-NEXT: [[X_ADDR_05:%.*]] = phi i32 [ [[X]], [[WHILE_BODY_LR_PH]] ], [ [[SHR:%.*]], [[WHILE_BODY]] ] -; NOLZCNT-NEXT: [[SHR]] = ashr i32 [[X_ADDR_05]], 1 -; NOLZCNT-NEXT: [[INC]] = add i32 [[CNT_06]], 1 -; NOLZCNT-NEXT: store volatile i8 42, i8* [[V]], align 1 -; NOLZCNT-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[SHR]], 0 -; NOLZCNT-NEXT: br i1 [[TOBOOL]], label [[WHILE_COND_WHILE_END_CRIT_EDGE:%.*]], label [[WHILE_BODY]] -; NOLZCNT: while.cond.while.end_crit_edge: -; NOLZCNT-NEXT: [[SPLIT:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ] -; NOLZCNT-NEXT: br label [[WHILE_END]] -; NOLZCNT: while.end: -; NOLZCNT-NEXT: [[CNT_0_LCSSA:%.*]] = phi i32 [ [[SPLIT]], [[WHILE_COND_WHILE_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ] -; NOLZCNT-NEXT: ret i32 [[CNT_0_LCSSA]] -; -entry: - %v = alloca i8, align 1 - %tobool4 = icmp eq i32 %x, 0 - br i1 %tobool4, label %while.end, label %while.body.lr.ph - -while.body.lr.ph: ; preds = %entry - br label %while.body - -while.body: ; preds = %while.body.lr.ph, %while.body - %cnt.06 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %while.body ] - %x.addr.05 = phi i32 [ %x, %while.body.lr.ph ], [ %shr, %while.body ] - %shr = ashr i32 %x.addr.05, 1 - %inc = add i32 %cnt.06, 1 - store volatile i8 42, i8* %v, align 1 - %tobool = icmp eq i32 %shr, 0 - br i1 %tobool, label %while.cond.while.end_crit_edge, label %while.body - -while.cond.while.end_crit_edge: ; preds = %while.body - %split = phi i32 [ %inc, %while.body ] - br label %while.end - -while.end: ; preds = %while.cond.while.end_crit_edge, %entry - %cnt.0.lcssa = phi i32 [ %split, %while.cond.while.end_crit_edge ], [ 0, %entry ] - ret i32 %cnt.0.lcssa -} - Index: test/Transforms/LoopIdiom/X86/popcnt.ll =================================================================== --- test/Transforms/LoopIdiom/X86/popcnt.ll +++ test/Transforms/LoopIdiom/X86/popcnt.ll @@ -138,45 +138,3 @@ %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ] ret i32 %c.0.lcssa } - -; The a & (a - 1) in the loop is a & (b - 1) in this code. Make sure we don't -; convert it. 
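In rough C++ terms (an illustrative sketch only, not code from the test suite), the two LoopIdiom tests deleted above correspond to loops like these:

// ctlz.ll @foo: shift-count loop with a volatile store. For negative x the
// arithmetic shift converges to -1 rather than 0, so the source loop never
// terminates; rewriting it as a countable loop via ctlz would cap it at 32
// iterations, which is the behaviour change the FIXME warns about.
int foo(int x) {
  volatile char v = 0;
  int cnt = 0;
  while (x != 0) {
    x >>= 1;   // assumed arithmetic shift for signed x
    ++cnt;
    v = 42;    // volatile store keeps every iteration observable
  }
  return cnt;
}

// popcnt.ll @popcount_bad: shaped like the a &= a - 1 popcount idiom, but the
// mask comes from a different value b, so it must not become a single ctpop.
int popcount_bad(unsigned long long a, unsigned long long b) {
  int c = 0;
  while (a != 0) {
    ++c;
    a &= b - 1;  // not a & (a - 1)
  }
  return c;
}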
-define i32 @popcount_bad(i64 %a, i64 %b) nounwind uwtable readnone ssp { -; CHECK-LABEL: @popcount_bad( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TOBOOL3:%.*]] = icmp eq i64 [[A:%.*]], 0 -; CHECK-NEXT: br i1 [[TOBOOL3]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] -; CHECK: while.body.preheader: -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[C_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[A_ADDR_04:%.*]] = phi i64 [ [[AND:%.*]], [[WHILE_BODY]] ], [ [[A]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[INC]] = add nsw i32 [[C_05]], 1 -; CHECK-NEXT: [[SUB:%.*]] = add i64 [[B:%.*]], -1 -; CHECK-NEXT: [[AND]] = and i64 [[SUB]], [[A_ADDR_04]] -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[AND]], 0 -; CHECK-NEXT: br i1 [[TOBOOL]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ] -; CHECK-NEXT: br label [[WHILE_END]] -; CHECK: while.end: -; CHECK-NEXT: [[C_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[C_0_LCSSA]] -; -entry: - %tobool3 = icmp eq i64 %a, 0 - br i1 %tobool3, label %while.end, label %while.body - -while.body: ; preds = %entry, %while.body - %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] - %a.addr.04 = phi i64 [ %and, %while.body ], [ %a, %entry ] - %inc = add nsw i32 %c.05, 1 - %sub = add i64 %b, -1 - %and = and i64 %sub, %a.addr.04 - %tobool = icmp eq i64 %and, 0 - br i1 %tobool, label %while.end, label %while.body - -while.end: ; preds = %while.body, %entry - %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ] - ret i32 %c.0.lcssa -} Index: test/Transforms/LoopSimplify/preserve-scev.ll =================================================================== --- test/Transforms/LoopSimplify/preserve-scev.ll +++ test/Transforms/LoopSimplify/preserve-scev.ll @@ -95,7 +95,7 @@ ; CHECK: Loop %while.cond191: max backedge-taken count is 0 ; CHECK: Loop %while.cond191: Predicated backedge-taken count is 0 ; CHECK: Loop %while.cond191.outer: Unpredictable backedge-taken count. -; CHECK: Loop %while.cond191.outer: max backedge-taken count is false +; CHECK: Loop %while.cond191.outer: Unpredictable max backedge-taken count. ; CHECK: Loop %while.cond191.outer: Unpredictable predicated backedge-taken count. 
define void @mergeExit(i32 %MapAttrCount) nounwind uwtable ssp { entry: Index: test/Transforms/NewGVN/invariant.group.ll =================================================================== --- test/Transforms/NewGVN/invariant.group.ll +++ test/Transforms/NewGVN/invariant.group.ll @@ -26,7 +26,7 @@ entry: %ptr = alloca i8 store i8 42, i8* %ptr, !invariant.group !0 - %ptr2 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr) + %ptr2 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %ptr) %a = load i8, i8* %ptr, !invariant.group !0 call void @foo(i8* %ptr2); call to use %ptr2 @@ -243,7 +243,8 @@ entry: %ptr = alloca i8 store i8 42, i8* %ptr, !invariant.group !0 - %ptr2 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr) + %ptr2 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %ptr) +; CHECK-NOT: load %a = load i8, i8* %ptr2, !invariant.group !0 ; CHECK: ret i8 42 @@ -314,7 +315,7 @@ ; CHECK: store i8 %unknownValue, i8* %ptr, !invariant.group !0 store i8 %unknownValue, i8* %ptr, !invariant.group !0 - %newPtr2 = call i8* @llvm.launder.invariant.group.p0i8(i8* %ptr) + %newPtr2 = call i8* @llvm.invariant.group.barrier.p0i8(i8* %ptr) ; CHECK-NOT: load %d = load i8, i8* %newPtr2, !invariant.group !0 ; CHECK: ret i8 %unknownValue @@ -441,7 +442,7 @@ declare void @_ZN1AC1Ev(%struct.A*) declare void @fooBit(i1*, i1) -declare i8* @llvm.launder.invariant.group.p0i8(i8*) +declare i8* @llvm.invariant.group.barrier.p0i8(i8*) ; Function Attrs: nounwind declare void @llvm.assume(i1 %cmp.vtables) #0 Index: test/tools/llvm-cvtres/symbols.test =================================================================== --- test/tools/llvm-cvtres/symbols.test +++ test/tools/llvm-cvtres/symbols.test @@ -10,10 +10,6 @@ RUN: llvm-cvtres /verbose /out:%t %p/Inputs/test_resource.res RUN: llvm-readobj -symbols %t | FileCheck %s -// Test that parameters can be preceded by '-' in addition to '/': -RUN: llvm-cvtres -verbose -machine:X86 -out:%t %p/Inputs/test_resource.res -RUN: llvm-readobj -symbols %t | FileCheck %s - CHECK: Name: $R000000 CHECK-NEXT: Value: 0 CHECK-NEXT: Section: .rsrc$02 Index: test/tools/llvm-mca/X86/SLM/resources-sse2.s =================================================================== --- test/tools/llvm-mca/X86/SLM/resources-sse2.s +++ test/tools/llvm-mca/X86/SLM/resources-sse2.s @@ -679,7 +679,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] -# CHECK-NEXT: - 412.00 8.00 154.00 92.00 4.50 4.50 131.00 +# CHECK-NEXT: - 343.00 8.00 223.00 92.00 4.50 4.50 131.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] Instructions: @@ -932,7 +932,7 @@ # CHECK-NEXT: - - - 1.00 - - - - shufpd $1, %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 - - - 1.00 shufpd $1, (%rax), %xmm2 # CHECK-NEXT: - 70.00 - 1.00 - - - - sqrtpd %xmm0, %xmm2 -# CHECK-NEXT: - 70.00 - 1.00 - - - 1.00 sqrtpd (%rax), %xmm2 +# CHECK-NEXT: - 1.00 - 70.00 - - - 1.00 sqrtpd (%rax), %xmm2 # CHECK-NEXT: - 35.00 - 1.00 - - - - sqrtsd %xmm0, %xmm2 # CHECK-NEXT: - 35.00 - 1.00 - - - 1.00 sqrtsd (%rax), %xmm2 # CHECK-NEXT: - - - - 1.00 - - - subpd %xmm0, %xmm2 Index: test/tools/llvm-mca/X86/SandyBridge/resources-mmx.s =================================================================== --- test/tools/llvm-mca/X86/SandyBridge/resources-mmx.s +++ test/tools/llvm-mca/X86/SandyBridge/resources-mmx.s @@ -210,11 +210,11 @@ # CHECK-NEXT: 1 3 1.00 pcmpgtw %mm0, %mm2 # CHECK-NEXT: 2 8 1.00 * pcmpgtw (%rax), %mm2 # CHECK-NEXT: 1 5 1.00 pmaddwd %mm0, %mm2 -# CHECK-NEXT: 2 10 1.00 * 
pmaddwd (%rax), %mm2 +# CHECK-NEXT: 2 11 1.00 * pmaddwd (%rax), %mm2 # CHECK-NEXT: 1 5 1.00 pmulhw %mm0, %mm2 -# CHECK-NEXT: 2 10 1.00 * pmulhw (%rax), %mm2 +# CHECK-NEXT: 2 11 1.00 * pmulhw (%rax), %mm2 # CHECK-NEXT: 1 5 1.00 pmullw %mm0, %mm2 -# CHECK-NEXT: 2 10 1.00 * pmullw (%rax), %mm2 +# CHECK-NEXT: 2 11 1.00 * pmullw (%rax), %mm2 # CHECK-NEXT: 1 1 0.33 por %mm0, %mm2 # CHECK-NEXT: 2 6 0.50 * por (%rax), %mm2 # CHECK-NEXT: 1 1 1.00 pslld $1, %mm2 Index: test/tools/llvm-mca/X86/SandyBridge/resources-sse1.s =================================================================== --- test/tools/llvm-mca/X86/SandyBridge/resources-sse1.s +++ test/tools/llvm-mca/X86/SandyBridge/resources-sse1.s @@ -281,13 +281,13 @@ # CHECK-NEXT: 2 8 1.00 * pminub (%rax), %mm2 # CHECK-NEXT: 1 2 1.00 pmovmskb %xmm0, %ecx # CHECK-NEXT: 1 5 1.00 pmulhuw %mm0, %mm2 -# CHECK-NEXT: 2 10 1.00 * pmulhuw (%rax), %mm2 +# CHECK-NEXT: 2 11 1.00 * pmulhuw (%rax), %mm2 # CHECK-NEXT: 1 5 0.50 * * prefetcht0 (%rax) # CHECK-NEXT: 1 5 0.50 * * prefetcht1 (%rax) # CHECK-NEXT: 1 5 0.50 * * prefetcht2 (%rax) # CHECK-NEXT: 1 5 0.50 * * prefetchnta (%rax) # CHECK-NEXT: 1 5 1.00 psadbw %mm0, %mm2 -# CHECK-NEXT: 2 10 1.00 * psadbw (%rax), %mm2 +# CHECK-NEXT: 2 11 1.00 * psadbw (%rax), %mm2 # CHECK-NEXT: 1 1 1.00 pshufw $1, %mm0, %mm2 # CHECK-NEXT: 2 6 1.00 * pshufw $1, (%rax), %mm2 # CHECK-NEXT: 1 5 1.00 rcpps %xmm0, %xmm2 Index: test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s =================================================================== --- test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s +++ test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s @@ -573,7 +573,7 @@ # CHECK-NEXT: 1 5 1.00 pmullw %xmm0, %xmm2 # CHECK-NEXT: 2 11 1.00 * pmullw (%rax), %xmm2 # CHECK-NEXT: 1 5 1.00 pmuludq %mm0, %mm2 -# CHECK-NEXT: 2 10 1.00 * pmuludq (%rax), %mm2 +# CHECK-NEXT: 2 11 1.00 * pmuludq (%rax), %mm2 # CHECK-NEXT: 1 5 1.00 pmuludq %xmm0, %xmm2 # CHECK-NEXT: 2 11 1.00 * pmuludq (%rax), %xmm2 # CHECK-NEXT: 1 1 0.33 por %xmm0, %xmm2 Index: test/tools/llvm-mca/X86/SandyBridge/resources-ssse3.s =================================================================== --- test/tools/llvm-mca/X86/SandyBridge/resources-ssse3.s +++ test/tools/llvm-mca/X86/SandyBridge/resources-ssse3.s @@ -147,11 +147,11 @@ # CHECK-NEXT: 3 3 1.50 phsubw %xmm0, %xmm2 # CHECK-NEXT: 4 9 1.50 * phsubw (%rax), %xmm2 # CHECK-NEXT: 1 5 1.00 pmaddubsw %mm0, %mm2 -# CHECK-NEXT: 2 10 1.00 * pmaddubsw (%rax), %mm2 +# CHECK-NEXT: 2 11 1.00 * pmaddubsw (%rax), %mm2 # CHECK-NEXT: 1 5 1.00 pmaddubsw %xmm0, %xmm2 # CHECK-NEXT: 2 11 1.00 * pmaddubsw (%rax), %xmm2 # CHECK-NEXT: 1 5 1.00 pmulhrsw %mm0, %mm2 -# CHECK-NEXT: 2 10 1.00 * pmulhrsw (%rax), %mm2 +# CHECK-NEXT: 2 11 1.00 * pmulhrsw (%rax), %mm2 # CHECK-NEXT: 1 5 1.00 pmulhrsw %xmm0, %xmm2 # CHECK-NEXT: 2 11 1.00 * pmulhrsw (%rax), %xmm2 # CHECK-NEXT: 1 1 0.50 pshufb %mm0, %mm2 Index: test/tools/llvm-mca/X86/Znver1/resources-ssse3.s =================================================================== --- test/tools/llvm-mca/X86/Znver1/resources-ssse3.s +++ test/tools/llvm-mca/X86/Znver1/resources-ssse3.s @@ -139,7 +139,7 @@ # CHECK-NEXT: 1 100 - phsubd %xmm0, %xmm2 # CHECK-NEXT: 1 100 - * phsubd (%rax), %xmm2 # CHECK-NEXT: 1 100 - phsubsw %mm0, %mm2 -# CHECK-NEXT: 1 100 - * phsubsw (%rax), %mm2 +# CHECK-NEXT: 1 8 0.50 * phsubsw (%rax), %mm2 # CHECK-NEXT: 1 100 - phsubsw %xmm0, %xmm2 # CHECK-NEXT: 1 100 - * phsubsw (%rax), %xmm2 # CHECK-NEXT: 1 100 - phsubw %mm0, %mm2 @@ -187,7 +187,7 @@ # CHECK: Resource pressure per 
iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: 10.00 10.00 - - - - - 16.00 8.00 8.00 8.00 - +# CHECK-NEXT: 10.50 10.50 - - - - - 16.25 8.25 8.25 8.25 - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: @@ -224,7 +224,7 @@ # CHECK-NEXT: - - - - - - - - - - - - phsubd %xmm0, %xmm2 # CHECK-NEXT: - - - - - - - - - - - - phsubd (%rax), %xmm2 # CHECK-NEXT: - - - - - - - - - - - - phsubsw %mm0, %mm2 -# CHECK-NEXT: - - - - - - - - - - - - phsubsw (%rax), %mm2 +# CHECK-NEXT: 0.50 0.50 - - - - - 0.25 0.25 0.25 0.25 - phsubsw (%rax), %mm2 # CHECK-NEXT: - - - - - - - - - - - - phsubsw %xmm0, %xmm2 # CHECK-NEXT: - - - - - - - - - - - - phsubsw (%rax), %xmm2 # CHECK-NEXT: - - - - - - - - - - - - phsubw %mm0, %mm2 Index: test/tools/llvm-rc/helpmsg.test =================================================================== --- test/tools/llvm-rc/helpmsg.test +++ test/tools/llvm-rc/helpmsg.test @@ -7,7 +7,6 @@ ; CHECK-DAG: USAGE: rc [options] ; CHECK-DAG: OPTIONS: ; CHECK-NEXT: /? Display this help and exit. -; CHECK-NEXT: /C Set the codepage used for input strings. ; CHECK-NEXT: /dry-run Don't compile the input; only try to parse it. ; CHECK-NEXT: /D Define a symbol for the C preprocessor. ; CHECK-NEXT: /FO Change the output file location. Index: test/tools/llvm-rc/tag-menu.test =================================================================== --- test/tools/llvm-rc/tag-menu.test +++ test/tools/llvm-rc/tag-menu.test @@ -1,11 +1,6 @@ ; RUN: llvm-rc /FO %t %p/Inputs/tag-menu.rc ; RUN: llvm-readobj %t | FileCheck %s --check-prefix=MENU -; Test running llvm-rc without an explicit output file. -; RUN: cp %p/Inputs/tag-menu.rc %t.implicit.rc -; RUN: llvm-rc %t.implicit.rc -; RUN: llvm-readobj %t.implicit.res | FileCheck --check-prefix=MENU %s - ; MENU: Resource type (int): 4 ; MENU-NEXT: Resource name (string): CHECKRECURSION ; MENU-NEXT: Data version: 0 Index: test/tools/llvm-rc/tokenizer.test =================================================================== --- test/tools/llvm-rc/tokenizer.test +++ test/tools/llvm-rc/tokenizer.test @@ -1,4 +1,4 @@ -; RUN: not llvm-rc /V /FO %t.res %p/Inputs/tokens.rc | FileCheck %s +; RUN: not llvm-rc /V %p/Inputs/tokens.rc | FileCheck %s ; llvm-rc fails now on this sample because it is an invalid resource file ; script. We silence the error message and just analyze the output. Index: tools/llvm-cvtres/Opts.td =================================================================== --- tools/llvm-cvtres/Opts.td +++ tools/llvm-cvtres/Opts.td @@ -1,13 +1,11 @@ include "llvm/Option/OptParser.td" -// All the switches can be preceded by either '/' or '-'. 
- -def DEFINE : Joined<["/", "-"], "DEFINE:">, HelpText<"">, MetaVarName<"symbol">; -def FOLDDUPS : Flag<["/", "-"], "FOLDDUPS:">, HelpText<"">; -def MACHINE : Joined<["/", "-"], "MACHINE:">, HelpText<"">, MetaVarName<"{ARM|ARM64|EBC|IA64|X64|X86}">; -def NOLOGO : Flag<["/", "-"], "NOLOGO">, HelpText<"">; -def OUT : Joined<["/", "-"], "OUT:">, HelpText<"">, MetaVarName<"filename">; -def READONLY : Flag<["/", "-"], "READONLY">, HelpText<"">; -def VERBOSE : Flag<["/", "-"], "VERBOSE">, HelpText<"">; -def HELP : Flag<["/", "-"], "HELP">; -def H : Flag<["/", "-"], "H">, Alias; +def DEFINE : Joined<["/"], "DEFINE:">, HelpText<"">, MetaVarName<"symbol">; +def FOLDDUPS : Flag<["/"], "FOLDDUPS:">, HelpText<"">; +def MACHINE : Joined<["/"], "MACHINE:">, HelpText<"">, MetaVarName<"{ARM|ARM64|EBC|IA64|X64|X86}">; +def NOLOGO : Flag<["/"], "NOLOGO">, HelpText<"">; +def OUT : Joined<["/"], "OUT:">, HelpText<"">, MetaVarName<"filename">; +def READONLY : Flag<["/"], "READONLY">, HelpText<"">; +def VERBOSE : Flag<["/"], "VERBOSE">, HelpText<"">; +def HELP : Flag<["/"], "HELP">; +def H : Flag<["/"], "H">, Alias; Index: tools/llvm-objcopy/Object.h =================================================================== --- tools/llvm-objcopy/Object.h +++ tools/llvm-objcopy/Object.h @@ -366,7 +366,6 @@ const SectionBase *getStrTab() const { return SymbolNames; } const Symbol *getSymbolByIndex(uint32_t Index) const; void updateSymbols(function_ref Callable); - void removeSymbols(function_ref ToRemove); void removeSectionReferences(const SectionBase *Sec) override; void initialize(SectionTableRef SecTable) override; Index: tools/llvm-objcopy/Object.cpp =================================================================== --- tools/llvm-objcopy/Object.cpp +++ tools/llvm-objcopy/Object.cpp @@ -194,7 +194,12 @@ " cannot be removed because it is referenced by the symbol table " + this->Name); } - removeSymbols([Sec](const Symbol &Sym) { return Sym.DefinedIn == Sec; }); + auto Iter = + std::remove_if(std::begin(Symbols), std::end(Symbols), + [=](const SymPtr &Sym) { return Sym->DefinedIn == Sec; }); + Size -= (std::end(Symbols) - Iter) * this->EntrySize; + Symbols.erase(Iter, std::end(Symbols)); + assignIndices(); } void SymbolTableSection::updateSymbols(function_ref Callable) { @@ -206,15 +211,6 @@ assignIndices(); } -void SymbolTableSection::removeSymbols(function_ref ToRemove) { - Symbols.erase( - std::remove_if(std::begin(Symbols), std::end(Symbols), - [ToRemove](const SymPtr &Sym) { return ToRemove(*Sym); }), - std::end(Symbols)); - Size = Symbols.size() * EntrySize; - assignIndices(); -} - void SymbolTableSection::initialize(SectionTableRef SecTable) { Size = 0; setStrTab(SecTable.getSectionOfType( Index: tools/llvm-objcopy/Opts.td =================================================================== --- tools/llvm-objcopy/Opts.td +++ tools/llvm-objcopy/Opts.td @@ -72,9 +72,3 @@ HelpText<"Mark as weak">; def W : JoinedOrSeparate<["-"], "W">, Alias; -def weaken : Flag<["-", "--"], "weaken">, - HelpText<"Mark all global symbols as weak">; -def discard_all : Flag<["-", "--"], "discard-all">, - HelpText<"Remove all local symbols except file and section symbols">; -def x : Flag<["-"], "x">, - Alias; Index: tools/llvm-objcopy/llvm-objcopy.cpp =================================================================== --- tools/llvm-objcopy/llvm-objcopy.cpp +++ tools/llvm-objcopy/llvm-objcopy.cpp @@ -130,8 +130,6 @@ bool StripDWO; bool ExtractDWO; bool LocalizeHidden; - bool Weaken; - bool DiscardAll; }; using SectionPred = 
std::function; @@ -336,22 +334,10 @@ Sym.Binding == STB_GLOBAL) Sym.Binding = STB_WEAK; - if (Config.Weaken && Sym.Binding == STB_GLOBAL && - Sym.getShndx() != SHN_UNDEF) - Sym.Binding = STB_WEAK; - const auto I = Config.SymbolsToRename.find(Sym.Name); if (I != Config.SymbolsToRename.end()) Sym.Name = I->getValue(); }); - - Obj.SymbolTable->removeSymbols([&](const Symbol &Sym) { - if (Config.DiscardAll && Sym.Binding == STB_LOCAL && - Sym.getShndx() != SHN_UNDEF && Sym.Type != STT_FILE && - Sym.Type != STT_SECTION) - return true; - return false; - }); } } @@ -437,8 +423,6 @@ Config.StripNonAlloc = InputArgs.hasArg(OBJCOPY_strip_non_alloc); Config.ExtractDWO = InputArgs.hasArg(OBJCOPY_extract_dwo); Config.LocalizeHidden = InputArgs.hasArg(OBJCOPY_localize_hidden); - Config.Weaken = InputArgs.hasArg(OBJCOPY_weaken); - Config.DiscardAll = InputArgs.hasArg(OBJCOPY_discard_all); for (auto Arg : InputArgs.filtered(OBJCOPY_localize_symbol)) Config.SymbolsToLocalize.push_back(Arg->getValue()); for (auto Arg : InputArgs.filtered(OBJCOPY_globalize_symbol)) Index: tools/llvm-rc/Opts.td =================================================================== --- tools/llvm-rc/Opts.td +++ tools/llvm-rc/Opts.td @@ -35,9 +35,6 @@ def DRY_RUN : Flag<[ "/", "-" ], "dry-run">, HelpText<"Don't compile the input; only try to parse it.">; -def CODEPAGE : JoinedOrSeparate<[ "/", "-" ], "C">, - HelpText<"Set the codepage used for input strings.">; - // Unused switches (at least for now). These will stay unimplemented // in an early stage of development and can be ignored. However, we need to // parse them in order to preserve the compatibility with the original tool. @@ -47,6 +44,7 @@ def SL : Flag<[ "/", "-" ], "SL">; // (Codepages support.) +def C : Flag<[ "/", "-" ], "C">; def W : Flag<[ "/", "-" ], "W">; // (Support of MUI and similar.) Index: tools/llvm-rc/ResourceFileWriter.h =================================================================== --- tools/llvm-rc/ResourceFileWriter.h +++ tools/llvm-rc/ResourceFileWriter.h @@ -25,25 +25,15 @@ namespace rc { -enum CodePage { - CpAcp = 0, // The current used codepage. Since there's no such - // notion in LLVM what codepage it actually means, - // this only allows ASCII. - CpWin1252 = 1252, // A codepage where most 8 bit values correspond to - // unicode code points with the same value. - CpUtf8 = 65001, // UTF-8. -}; - -struct WriterParams { +struct SearchParams { std::vector Include; // Additional folders to search for files. std::vector NoInclude; // Folders to exclude from file search. StringRef InputFilePath; // The full path of the input file. - int CodePage = CpAcp; // The codepage for interpreting characters. }; class ResourceFileWriter : public Visitor { public: - ResourceFileWriter(const WriterParams &Params, + ResourceFileWriter(const SearchParams &Params, std::unique_ptr Stream) : Params(Params), FS(std::move(Stream)), IconCursorID(1) { assert(FS && "Output stream needs to be provided to the serializator"); @@ -156,7 +146,7 @@ Error writeVersionInfoBlock(const VersionInfoBlock &); Error writeVersionInfoValue(const VersionInfoValue &); - const WriterParams &Params; + const SearchParams &Params; // Output stream handling. 
std::unique_ptr FS; Index: tools/llvm-rc/ResourceFileWriter.cpp =================================================================== --- tools/llvm-rc/ResourceFileWriter.cpp +++ tools/llvm-rc/ResourceFileWriter.cpp @@ -110,18 +110,6 @@ return true; } -static UTF16 cp1252ToUnicode(unsigned char C) { - static const UTF16 Map80[] = { - 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021, - 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f, - 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, - 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178, - }; - if (C >= 0x80 && C <= 0x9F) - return Map80[C - 0x80]; - return C; -} - // Describes a way to handle '\0' characters when processing the string. // rc.exe tool sometimes behaves in a weird way in postprocessing. // If the string to be output is equivalent to a C-string (e.g. in MENU @@ -144,26 +132,10 @@ // * Replace the escape sequences with their processed version. // For identifiers, this is no-op. static Error processString(StringRef Str, NullHandlingMethod NullHandler, - bool &IsLongString, SmallVectorImpl &Result, - int CodePage) { + bool &IsLongString, SmallVectorImpl &Result) { bool IsString = stripQuotes(Str, IsLongString); SmallVector Chars; - - // Convert the input bytes according to the chosen codepage. - if (CodePage == CpUtf8) { - convertUTF8ToUTF16String(Str, Chars); - } else if (CodePage == CpWin1252) { - for (char C : Str) - Chars.push_back(cp1252ToUnicode((unsigned char)C)); - } else { - // For other, unknown codepages, only allow plain ASCII input. - for (char C : Str) { - if ((unsigned char)C > 0x7F) - return createError("Non-ASCII 8-bit codepoint (" + Twine(C) + - ") can't be interpreted in the current codepage"); - Chars.push_back((unsigned char)C); - } - } + convertUTF8ToUTF16String(Str, Chars); if (!IsString) { // It's an identifier if it's not a string. Make all characters uppercase. @@ -185,35 +157,21 @@ if (Char > 0xFF) return createError("Non-8-bit codepoint (" + Twine(Char) + ") can't occur in a user-defined narrow string"); - } - } - Result.push_back(Char); - return Error::success(); - }; - auto AddEscapedChar = [AddRes, IsLongString, CodePage](UTF16 Char) -> Error { - if (!IsLongString) { - // Escaped chars in narrow strings have to be interpreted according to - // the chosen code page. - if (Char > 0xFF) - return createError("Non-8-bit escaped char (" + Twine(Char) + - ") can't occur in narrow string"); - if (CodePage == CpUtf8) { - if (Char >= 0x80) - return createError("Unable to interpret single byte (" + Twine(Char) + - ") as UTF-8"); - } else if (CodePage == CpWin1252) { - Char = cp1252ToUnicode(Char); } else { - // Unknown/unsupported codepage, only allow ASCII input. - if (Char > 0x7F) + // In case of narrow non-user strings, Windows RC converts + // [0x80, 0xFF] chars according to the current codepage. + // There is no 'codepage' concept settled in every supported platform, + // so we should reject such inputs. 
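As a standalone illustration of the policy described in the comment above (a hypothetical helper, not part of ResourceFileWriter): a byte such as 0x80 is U+20AC under Windows-1252 (see the removed cp1252ToUnicode table), U+0080 under Latin-1, and not a valid standalone UTF-8 sequence, so without a code page only plain ASCII can be interpreted safely.

#include <cstdint>
#include <vector>

// Minimal ASCII-only narrow-string handling: reject any byte whose meaning
// would depend on a code page, pass 0x00-0x7F through unchanged.
bool appendNarrowChar(unsigned char C, std::vector<uint16_t> &Out) {
  if (C > 0x7F)
    return false;   // interpretation would require a code page
  Out.push_back(C); // ASCII code units map to the same code points
  return true;
}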
+ if (Char > 0x7F && Char <= 0xFF) return createError("Non-ASCII 8-bit codepoint (" + Twine(Char) + ") can't " "occur in a non-Unicode string"); } } - return AddRes(Char); + Result.push_back(Char); + return Error::success(); }; while (Pos < Chars.size()) { @@ -265,7 +223,7 @@ --RemainingChars; } - RETURN_IF_ERROR(AddEscapedChar(ReadInt)); + RETURN_IF_ERROR(AddRes(ReadInt)); continue; } @@ -282,7 +240,7 @@ ++Pos; } - RETURN_IF_ERROR(AddEscapedChar(ReadInt)); + RETURN_IF_ERROR(AddRes(ReadInt)); continue; } @@ -370,8 +328,7 @@ SmallVector ProcessedString; bool IsLongString; RETURN_IF_ERROR(processString(Str, NullHandlingMethod::CutAtNull, - IsLongString, ProcessedString, - Params.CodePage)); + IsLongString, ProcessedString)); for (auto Ch : ProcessedString) writeInt(Ch); if (WriteTerminator) @@ -1185,7 +1142,6 @@ static bool classof(const RCResource *Res) { return Res->getKind() == RkStringTableBundle; } - Twine getResourceTypeName() const override { return "STRINGTABLE"; } }; Error ResourceFileWriter::visitStringTableBundle(const RCResource *Res) { @@ -1212,7 +1168,7 @@ SmallVector Data; RETURN_IF_ERROR(processString(Res->Bundle.Data[ID].getValueOr(StringRef()), NullHandlingMethod::CutAtDoubleNull, - IsLongString, Data, Params.CodePage)); + IsLongString, Data)); if (AppendNull && Res->Bundle.Data[ID]) Data.push_back('\0'); RETURN_IF_ERROR( @@ -1259,9 +1215,9 @@ SmallVector ProcessedString; bool IsLongString; - RETURN_IF_ERROR( - processString(Elem.getString(), NullHandlingMethod::UserResource, - IsLongString, ProcessedString, Params.CodePage)); + RETURN_IF_ERROR(processString(Elem.getString(), + NullHandlingMethod::UserResource, + IsLongString, ProcessedString)); for (auto Ch : ProcessedString) { if (IsLongString) { Index: tools/llvm-rc/llvm-rc.cpp =================================================================== --- tools/llvm-rc/llvm-rc.cpp +++ tools/llvm-rc/llvm-rc.cpp @@ -24,7 +24,6 @@ #include "llvm/Support/InitLLVM.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Path.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/Process.h" #include "llvm/Support/Signals.h" @@ -130,43 +129,21 @@ } } - WriterParams Params; + SearchParams Params; SmallString<128> InputFile(InArgsInfo[0]); llvm::sys::fs::make_absolute(InputFile); Params.InputFilePath = InputFile; Params.Include = InputArgs.getAllArgValues(OPT_INCLUDE); Params.NoInclude = InputArgs.getAllArgValues(OPT_NOINCLUDE); - if (InputArgs.hasArg(OPT_CODEPAGE)) { - if (InputArgs.getLastArgValue(OPT_CODEPAGE) - .getAsInteger(10, Params.CodePage)) - fatalError("Invalid code page: " + - InputArgs.getLastArgValue(OPT_CODEPAGE)); - switch (Params.CodePage) { - case CpAcp: - case CpWin1252: - case CpUtf8: - break; - default: - fatalError( - "Unsupported code page, only 0, 1252 and 65001 are supported!"); - } - } - std::unique_ptr Visitor; bool IsDryRun = InputArgs.hasArg(OPT_DRY_RUN); if (!IsDryRun) { auto OutArgsInfo = InputArgs.getAllArgValues(OPT_FILEOUT); - if (OutArgsInfo.empty()) { - SmallString<128> OutputFile = InputFile; - llvm::sys::path::replace_extension(OutputFile, "res"); - OutArgsInfo.push_back(OutputFile.str()); - } - if (OutArgsInfo.size() != 1) fatalError( - "No more than one output file should be provided (using /FO flag)."); + "Exactly one output file should be provided (using /FO flag)."); std::error_code EC; auto FOut = Index: tools/llvm-xray/CMakeLists.txt =================================================================== --- 
tools/llvm-xray/CMakeLists.txt +++ tools/llvm-xray/CMakeLists.txt @@ -4,8 +4,7 @@ Object Support Symbolize - XRay - ) + XRay) set(LLVM_XRAY_TOOLS func-id-helper.cpp @@ -16,10 +15,6 @@ xray-graph.cpp xray-graph-diff.cpp xray-stacks.cpp - xray-registry.cpp - ) + xray-registry.cpp) -add_llvm_tool(llvm-xray - llvm-xray.cpp - ${LLVM_XRAY_TOOLS} - ) +add_llvm_tool(llvm-xray llvm-xray.cpp ${LLVM_XRAY_TOOLS}) Index: unittests/Passes/CMakeLists.txt =================================================================== --- unittests/Passes/CMakeLists.txt +++ unittests/Passes/CMakeLists.txt @@ -20,7 +20,6 @@ PROPERTIES PREFIX "" SUFFIX ".so" ) -set_target_properties(TestPlugin PROPERTIES FOLDER "Tests") if (WIN32 OR CYGWIN OR LLVM_EXPORT_SYMBOLS_FOR_PLUGINS) llvm_map_components_to_libnames(LLVM_DEPS ${LLVM_LINK_COMPONENTS}) Index: utils/TableGen/GlobalISelEmitter.cpp =================================================================== --- utils/TableGen/GlobalISelEmitter.cpp +++ utils/TableGen/GlobalISelEmitter.cpp @@ -407,8 +407,6 @@ unsigned size() const { return NumElements; } }; -class Matcher; - /// Holds the contents of a generated MatchTable to enable formatting and the /// necessary index tracking needed to support GIM_Try. class MatchTable { @@ -421,11 +419,10 @@ /// The currently defined labels. DenseMap LabelMap; /// Tracks the sum of MatchTableRecord::NumElements as the table is built. - unsigned CurrentSize = 0; + unsigned CurrentSize; + /// A unique identifier for a MatchTable label. - unsigned CurrentLabelID = 0; - /// Determines if the table should be instrumented for rule coverage tracking. - bool IsWithCoverage; + static unsigned CurrentLabelID; public: static MatchTableRecord LineBreak; @@ -468,12 +465,7 @@ MatchTableRecord::MTRF_CommaFollows); } - static MatchTable buildTable(ArrayRef Rules, bool WithCoverage); - - MatchTable(bool WithCoverage, unsigned ID = 0) - : ID(ID), IsWithCoverage(WithCoverage) {} - - bool isWithCoverage() const { return IsWithCoverage; } + MatchTable(unsigned ID) : ID(ID), CurrentSize(0) {} void push_back(const MatchTableRecord &Value) { if (Value.Flags & MatchTableRecord::MTRF_Label) @@ -482,7 +474,7 @@ CurrentSize += Value.size(); } - unsigned allocateLabelID() { return CurrentLabelID++; } + unsigned allocateLabelID() const { return CurrentLabelID++; } void defineLabel(unsigned LabelID) { LabelMap.insert(std::make_pair(LabelID, CurrentSize)); @@ -527,6 +519,8 @@ } }; +unsigned MatchTable::CurrentLabelID = 0; + MatchTableRecord MatchTable::LineBreak = { None, "" /* Emit String */, 0 /* Elements */, MatchTableRecord::MTRF_LineBreakFollows}; @@ -583,15 +577,6 @@ virtual std::unique_ptr forgetFirstCondition() = 0; }; -MatchTable MatchTable::buildTable(ArrayRef Rules, - bool WithCoverage) { - MatchTable Table(WithCoverage); - for (Matcher *Rule : Rules) - Rule->emit(Table); - - return Table << MatchTable::Opcode("GIM_Reject") << MatchTable::LineBreak; -} - class GroupMatcher : public Matcher { SmallVector, 8> Conditions; SmallVector Rules; @@ -2498,7 +2483,7 @@ for (const auto &MA : Actions) MA->emitActionOpcodes(Table, *this); - if (Table.isWithCoverage()) + if (GenerateCoverage) Table << MatchTable::Opcode("GIR_Coverage") << MatchTable::IntValue(RuleID) << MatchTable::LineBreak; @@ -2701,11 +2686,8 @@ /// # predicate C /// \endverbatim std::vector optimizeRules( - ArrayRef Rules, + const std::vector &Rules, std::vector> &StorageGroupMatcher); - - MatchTable buildMatchTable(MutableArrayRef Rules, bool Optimize, - bool WithCoverage); }; void 
GlobalISelEmitter::gatherNodeEquivs() { @@ -3662,7 +3644,7 @@ } std::vector GlobalISelEmitter::optimizeRules( - ArrayRef Rules, + const std::vector &Rules, std::vector> &StorageGroupMatcher) { std::vector OptRules; // Start with a stupid grouping for now. @@ -3693,23 +3675,6 @@ return OptRules; } -MatchTable -GlobalISelEmitter::buildMatchTable(MutableArrayRef Rules, - bool Optimize, bool WithCoverage) { - std::vector InputRules; - for (Matcher &Rule : Rules) - InputRules.push_back(&Rule); - - if (!Optimize) - return MatchTable::buildTable(InputRules, WithCoverage); - - std::vector> StorageGroupMatcher; - std::vector OptRules = - optimizeRules(InputRules, StorageGroupMatcher); - - return MatchTable::buildTable(OptRules, WithCoverage); -} - void GlobalISelEmitter::run(raw_ostream &OS) { if (!UseCoverageFile.empty()) { RuleCoverage = CodeGenCoverage(); @@ -3802,13 +3767,12 @@ << "InstructionSelector::ComplexMatcherMemFn ComplexPredicateFns[];\n" << " static " << Target.getName() << "InstructionSelector::CustomRendererFn CustomRenderers[];\n" - << " bool testImmPredicate_I64(unsigned PredicateID, int64_t Imm) const " + << "bool testImmPredicate_I64(unsigned PredicateID, int64_t Imm) const " "override;\n" - << " bool testImmPredicate_APInt(unsigned PredicateID, const APInt &Imm) " + << "bool testImmPredicate_APInt(unsigned PredicateID, const APInt &Imm) " "const override;\n" - << " bool testImmPredicate_APFloat(unsigned PredicateID, const APFloat " + << "bool testImmPredicate_APFloat(unsigned PredicateID, const APFloat " "&Imm) const override;\n" - << " const int64_t *getMatchTable() const override;\n" << "#endif // ifdef GET_GLOBALISEL_TEMPORARIES_DECL\n\n"; OS << "#ifdef GET_GLOBALISEL_TEMPORARIES_INIT\n" @@ -3960,6 +3924,20 @@ << ", // " << Record->getName() << "\n"; OS << "};\n\n"; + OS << "bool " << Target.getName() + << "InstructionSelector::selectImpl(MachineInstr &I, CodeGenCoverage " + "&CoverageInfo) const {\n" + << " MachineFunction &MF = *I.getParent()->getParent();\n" + << " MachineRegisterInfo &MRI = MF.getRegInfo();\n" + << " // FIXME: This should be computed on a per-function basis rather " + "than per-insn.\n" + << " AvailableFunctionFeatures = computeAvailableFunctionFeatures(&STI, " + "&MF);\n" + << " const PredicateBitset AvailableFeatures = getAvailableFeatures();\n" + << " NewMIVector OutMIs;\n" + << " State.MIs.clear();\n" + << " State.MIs.push_back(&I);\n\n"; + std::stable_sort(Rules.begin(), Rules.end(), [&](const RuleMatcher &A, const RuleMatcher &B) { int ScoreA = RuleMatcherScores[A.getRuleID()]; @@ -3976,37 +3954,31 @@ } return false; }); + std::vector> StorageGroupMatcher; - OS << "bool " << Target.getName() - << "InstructionSelector::selectImpl(MachineInstr &I, CodeGenCoverage " - "&CoverageInfo) const {\n" - << " MachineFunction &MF = *I.getParent()->getParent();\n" - << " MachineRegisterInfo &MRI = MF.getRegInfo();\n" - << " // FIXME: This should be computed on a per-function basis rather " - "than per-insn.\n" - << " AvailableFunctionFeatures = computeAvailableFunctionFeatures(&STI, " - "&MF);\n" - << " const PredicateBitset AvailableFeatures = getAvailableFeatures();\n" - << " NewMIVector OutMIs;\n" - << " State.MIs.clear();\n" - << " State.MIs.push_back(&I);\n\n" - << " if (executeMatchTable(*this, OutMIs, State, ISelInfo" - << ", getMatchTable(), TII, MRI, TRI, RBI, AvailableFeatures" - << ", CoverageInfo)) {\n" - << " return true;\n" - << " }\n\n" - << " return false;\n" - << "}\n\n"; + std::vector InputRules; + for (Matcher &Rule : Rules) + 
InputRules.push_back(&Rule); - const MatchTable Table = - buildMatchTable(Rules, OptimizeMatchTable, GenerateCoverage); - OS << "const int64_t *" << Target.getName() - << "InstructionSelector::getMatchTable() const {\n"; + std::vector OptRules = + OptimizeMatchTable ? optimizeRules(InputRules, StorageGroupMatcher) + : InputRules; + + MatchTable Table(0); + for (Matcher *Rule : OptRules) + Rule->emit(Table); + + Table << MatchTable::Opcode("GIM_Reject") << MatchTable::LineBreak; Table.emitDeclaration(OS); - OS << " return "; + OS << " if (executeMatchTable(*this, OutMIs, State, ISelInfo, "; Table.emitUse(OS); - OS << ";\n}\n"; - OS << "#endif // ifdef GET_GLOBALISEL_IMPL\n"; + OS << ", TII, MRI, TRI, RBI, AvailableFeatures, CoverageInfo)) {\n" + << " return true;\n" + << " }\n\n"; + + OS << " return false;\n" + << "}\n" + << "#endif // ifdef GET_GLOBALISEL_IMPL\n"; OS << "#ifdef GET_GLOBALISEL_PREDICATES_DECL\n" << "PredicateBitset AvailableModuleFeatures;\n" Index: utils/lit/lit/TestingConfig.py =================================================================== --- utils/lit/lit/TestingConfig.py +++ utils/lit/lit/TestingConfig.py @@ -171,9 +171,3 @@ def __str__(self): return self.substitution - def __len__(self): - return len(self.substitution) - - def __getitem__(self, item): - return self.substitution.__getitem__(item) -
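To close, here is a self-contained sketch of the label bookkeeping that the GlobalISelEmitter hunks above keep rearranging; the names are hypothetical and this is not the real utils/TableGen MatchTable class, only the idea of resolving labels to element indices.

#include <map>
#include <string>
#include <vector>

struct SketchMatchTable {
  std::vector<std::string> Records;       // rendered table entries
  std::map<unsigned, unsigned> LabelMap;  // label id -> element index
  unsigned CurrentSize = 0;               // table elements emitted so far
  unsigned NextLabelID = 0;

  unsigned allocateLabelID() { return NextLabelID++; }
  void defineLabel(unsigned LabelID) { LabelMap[LabelID] = CurrentSize; }
  void push(const std::string &Text, unsigned NumElements) {
    Records.push_back(Text);
    CurrentSize += NumElements;
  }
  unsigned labelIndex(unsigned LabelID) const { return LabelMap.at(LabelID); }
};

// Usage: allocate a label as the on-failure target, emit the rule's records,
// then define the label so the jump resolves to the index just past the rule.
void emitOneRule(SketchMatchTable &Table) {
  unsigned FailLabel = Table.allocateLabelID();
  Table.push("GIM_Try /* jump to label " + std::to_string(FailLabel) + " on failure */", 2);
  Table.push("... rule predicates and actions ...", 5);
  Table.defineLabel(FailLabel); // matching resumes here with the next rule
}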