Index: include/llvm/CodeGen/AsmPrinter.h
===================================================================
--- include/llvm/CodeGen/AsmPrinter.h
+++ include/llvm/CodeGen/AsmPrinter.h
@@ -22,10 +22,12 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/DwarfStringPoolEntry.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/SourceMgr.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include
 #include
 #include
@@ -112,6 +114,11 @@
   typedef std::pair GOTEquivUsePair;
   MapVector GlobalGOTEquivs;
 
+  /// The target schedule model.
+  TargetSchedModel SchedModel;
+  /// Enable printing of [latency:throughput] comments in the output.
+  bool EnablePrintSchedInfo = false;
+
 private:
   MCSymbol *CurrentFnBegin = nullptr;
   MCSymbol *CurrentFnEnd = nullptr;
Index: include/llvm/CodeGen/TargetSchedule.h
===================================================================
--- include/llvm/CodeGen/TargetSchedule.h
+++ include/llvm/CodeGen/TargetSchedule.h
@@ -189,6 +189,9 @@
   /// This is typically one cycle.
   unsigned computeOutputLatency(const MachineInstr *DefMI, unsigned DefIdx,
                                 const MachineInstr *DepMI) const;
+  /// \brief Compute the reciprocal throughput of the given instruction.
+  Optional<double> computeInstrRThroughput(const MachineInstr *MI) const;
+  Optional<double> computeInstrRThroughput(unsigned Opcode) const;
 };
 
 } // end namespace llvm
Index: include/llvm/MC/MCObjectStreamer.h
===================================================================
--- include/llvm/MC/MCObjectStreamer.h
+++ include/llvm/MC/MCObjectStreamer.h
@@ -97,7 +97,8 @@
   void EmitSLEB128Value(const MCExpr *Value) override;
   void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override;
   void ChangeSection(MCSection *Section, const MCExpr *Subsection) override;
-  void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo& STI) override;
+  void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                       bool PrintSchedInfo = false) override;
 
   /// \brief Emit an instruction to a special fragment, because this instruction
   /// can change its size during relaxation.
Index: include/llvm/MC/MCStreamer.h
===================================================================
--- include/llvm/MC/MCStreamer.h
+++ include/llvm/MC/MCStreamer.h
@@ -836,7 +836,8 @@
   }
 
   /// \brief Emit the given \p Instruction into the current section.
-  virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI);
+  virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                               bool PrintSchedInfo = false);
 
   /// \brief Set the bundle alignment mode from now on in the section.
   /// The argument is the power of 2 to which the alignment is set.
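A note on the new interfaces above: when EnablePrintSchedInfo is set, each assembly comment gains a trailing "[latency:reciprocal throughput]" pair, e.g. "vpavgb (%rdi), %xmm0, %xmm0 # [5:0.50]" in the updated X86 tests below. The reciprocal throughput reported by computeInstrRThroughput() is obtained by taking, for every processor resource the instruction occupies, NumUnits / Cycles, keeping the minimum over all used resources, and inverting the result. A minimal standalone sketch of that computation (the function name and the (NumUnits, Cycles) input representation are illustrative only; the real implementation is the one added to lib/CodeGen/TargetSchedule.cpp further down):

    #include <algorithm>
    #include <limits>
    #include <utility>
    #include <vector>

    // Each pair describes one processor resource used by the instruction:
    // (number of identical units, cycles the instruction occupies a unit).
    static double computeRThroughput(
        const std::vector<std::pair<unsigned, unsigned>> &Uses) {
      double Throughput = std::numeric_limits<double>::infinity();
      for (const auto &U : Uses) {
        if (U.second == 0)
          continue; // Resource is not actually consumed; skip it.
        Throughput = std::min(Throughput, double(U.first) / U.second);
      }
      // The printed value is the reciprocal of the sustainable throughput.
      return 1.0 / Throughput;
    }

For example, an instruction that occupies one of two identical ports for one cycle sustains a throughput of two instructions per cycle, so it is printed as 0.50, matching the "[1:0.50]" comments in the tests.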
Index: include/llvm/MC/MCSubtargetInfo.h
===================================================================
--- include/llvm/MC/MCSubtargetInfo.h
+++ include/llvm/MC/MCSubtargetInfo.h
@@ -26,6 +26,8 @@
 #include
 
 namespace llvm {
+class MachineInstr;
+class MCInst;
 
 //===----------------------------------------------------------------------===//
 ///
@@ -167,6 +169,14 @@
     auto Found = std::lower_bound(ProcDesc.begin(), ProcDesc.end(), CPU);
     return Found != ProcDesc.end() && StringRef(Found->Key) == CPU;
   }
+
+  /// Returns a string representation of the scheduling comment.
+  virtual std::string getSchedInfoStr(const MachineInstr &MI) const {
+    return std::string();
+  }
+  virtual std::string getSchedInfoStr(const MCInst &MCI) const {
+    return std::string();
+  }
 };
 
 } // end namespace llvm
Index: include/llvm/Target/TargetSubtargetInfo.h
===================================================================
--- include/llvm/Target/TargetSubtargetInfo.h
+++ include/llvm/Target/TargetSubtargetInfo.h
@@ -18,8 +18,9 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/PBQPRAConstraint.h"
-#include "llvm/CodeGen/SchedulerRegistry.h"
 #include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/CodeGen.h"
 #include
@@ -143,6 +144,9 @@
   /// TargetLowering preference). It does not yet disable the postRA scheduler.
   virtual bool enableMachineScheduler() const;
 
+  /// \brief Enable printing of [latency:throughput] comments in the output .S file.
+  virtual bool supportPrintSchedInfo() const { return false; }
+
   /// \brief True if the machine scheduler should disable the TLI preference
   /// for preRA scheduling with the source level scheduler.
virtual bool enableMachineSchedDefaultSched() const { return true; } Index: lib/CodeGen/AsmPrinter/AsmPrinter.cpp =================================================================== --- lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/AsmPrinter.h" #include "AsmPrinterHandler.h" #include "CodeViewDebug.h" #include "DwarfDebug.h" @@ -19,17 +20,16 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/CodeGen/Analysis.h" -#include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCMetadataPrinter.h" #include "llvm/CodeGen/GCStrategy.h" @@ -46,6 +46,7 @@ #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/TargetSchedule.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" @@ -86,9 +87,9 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/Timer.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" @@ -122,6 +123,10 @@ STATISTIC(EmittedInsts, "Number of machine instrs printed"); +static cl::opt + PrintSchedule("print-schedule", cl::Hidden, cl::init(false), + cl::desc("Print [latency:throughput] in .s output")); + char AsmPrinter::ID = 0; typedef DenseMap> gcp_map_type; @@ -719,7 +724,8 @@ } /// emitComments - Pretty-print comments for instructions. -static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) { +static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS, + AsmPrinter *AP) { const MachineFunction *MF = MI.getParent()->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); @@ -727,6 +733,7 @@ int FI; const MachineFrameInfo &MFI = MF->getFrameInfo(); + bool Commented = false; // We assume a single instruction only has a spill or reload, not // both. 
@@ -734,24 +741,37 @@
   if (TII->isLoadFromStackSlotPostFE(MI, FI)) {
     if (MFI.isSpillSlotObjectIndex(FI)) {
       MMO = *MI.memoperands_begin();
-      CommentOS << MMO->getSize() << "-byte Reload\n";
+      CommentOS << MMO->getSize() << "-byte Reload";
+      Commented = true;
     }
   } else if (TII->hasLoadFromStackSlot(MI, MMO, FI)) {
-    if (MFI.isSpillSlotObjectIndex(FI))
-      CommentOS << MMO->getSize() << "-byte Folded Reload\n";
+    if (MFI.isSpillSlotObjectIndex(FI)) {
+      CommentOS << MMO->getSize() << "-byte Folded Reload";
+      Commented = true;
+    }
   } else if (TII->isStoreToStackSlotPostFE(MI, FI)) {
     if (MFI.isSpillSlotObjectIndex(FI)) {
       MMO = *MI.memoperands_begin();
-      CommentOS << MMO->getSize() << "-byte Spill\n";
+      CommentOS << MMO->getSize() << "-byte Spill";
+      Commented = true;
     }
   } else if (TII->hasStoreToStackSlot(MI, MMO, FI)) {
-    if (MFI.isSpillSlotObjectIndex(FI))
-      CommentOS << MMO->getSize() << "-byte Folded Spill\n";
+    if (MFI.isSpillSlotObjectIndex(FI)) {
+      CommentOS << MMO->getSize() << "-byte Folded Spill";
+      Commented = true;
+    }
   }
 
   // Check for spill-induced copies
-  if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse))
-    CommentOS << " Reload Reuse\n";
+  if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse)) {
+    Commented = true;
+    CommentOS << " Reload Reuse";
+  }
+
+  if (Commented && AP->EnablePrintSchedInfo)
+    CommentOS << " " << MF->getSubtarget().getSchedInfoStr(MI) << "\n";
+  else if (Commented)
+    CommentOS << "\n";
 }
 
 /// emitImplicitDef - This method emits the specified machine instruction
@@ -965,7 +985,7 @@
   }
 
   if (isVerbose())
-    emitComments(MI, OutStreamer->GetCommentOS());
+    emitComments(MI, OutStreamer->GetCommentOS(), this);
 
   switch (MI.getOpcode()) {
   case TargetOpcode::CFI_INSTRUCTION:
@@ -1382,6 +1402,13 @@
   ORE = &getAnalysis().getORE();
   if (isVerbose())
     LI = &getAnalysis();
+
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  EnablePrintSchedInfo = PrintSchedule.getNumOccurrences()
+                             ? PrintSchedule
+                             : STI.supportPrintSchedInfo();
+  if (EnablePrintSchedInfo)
+    SchedModel.init(STI.getSchedModel(), &STI, STI.getInstrInfo());
 }
 
 namespace {
Index: lib/CodeGen/TargetSchedule.cpp
===================================================================
--- lib/CodeGen/TargetSchedule.cpp
+++ lib/CodeGen/TargetSchedule.cpp
@@ -277,7 +277,10 @@
   if (SCDesc->isValid() && !SCDesc->isVariant())
     return computeInstrLatency(*SCDesc);
 
-  llvm_unreachable("No MI sched latency");
+  if (SCDesc->isValid())
+    llvm_unreachable("No MI sched latency: SCDesc->isVariant()");
+  // TODO: some opcodes don't have a valid MCSchedClassDesc?
+  return 0;
 }
 
 unsigned
@@ -331,3 +334,68 @@
   }
   return 0;
 }
+
+static Optional<double>
+getRThroughputFromItineraries(unsigned schedClass,
+                              const InstrItineraryData *IID) {
+  double Unknown = std::numeric_limits<double>::infinity();
+  double Throughput = Unknown;
+
+  for (const InstrStage *IS = IID->beginStage(schedClass),
+                        *E = IID->endStage(schedClass);
+       IS != E; ++IS) {
+    unsigned Cycles = IS->getCycles();
+    if (!Cycles)
+      continue;
+    Throughput =
+        std::min(Throughput, countPopulation(IS->getUnits()) * 1.0 / Cycles);
+  }
+  // The comment prints the reciprocal of the throughput, so invert it here.
+  return 1 / Throughput;
+}
+
+static Optional<double>
+getRThroughputFromInstrSchedModel(const MCSchedClassDesc *SCDesc,
+                                  const TargetSubtargetInfo *STI,
+                                  const MCSchedModel &SchedModel) {
+  double Unknown = std::numeric_limits<double>::infinity();
+  double Throughput = Unknown;
+
+  for (const MCWriteProcResEntry *WPR = STI->getWriteProcResBegin(SCDesc),
+                                 *WEnd = STI->getWriteProcResEnd(SCDesc);
+       WPR != WEnd; ++WPR) {
+    unsigned Cycles = WPR->Cycles;
+    if (!Cycles)
+      return Optional<double>();
+
+    unsigned NumUnits =
+        SchedModel.getProcResource(WPR->ProcResourceIdx)->NumUnits;
+    Throughput = std::min(Throughput, NumUnits * 1.0 / Cycles);
+  }
+  // The comment prints the reciprocal of the throughput, so invert it here.
+  return 1 / Throughput;
+}
+
+Optional<double>
+TargetSchedModel::computeInstrRThroughput(const MachineInstr *MI) const {
+  if (hasInstrItineraries())
+    return getRThroughputFromItineraries(MI->getDesc().getSchedClass(),
+                                         getInstrItineraries());
+  if (hasInstrSchedModel())
+    return getRThroughputFromInstrSchedModel(resolveSchedClass(MI), STI,
+                                             SchedModel);
+  return Optional<double>();
+}
+
+Optional<double>
+TargetSchedModel::computeInstrRThroughput(unsigned Opcode) const {
+  unsigned SchedClass = TII->get(Opcode).getSchedClass();
+  if (hasInstrItineraries())
+    return getRThroughputFromItineraries(SchedClass, getInstrItineraries());
+  if (hasInstrSchedModel()) {
+    const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass);
+    if (SCDesc->isValid() && !SCDesc->isVariant())
+      return getRThroughputFromInstrSchedModel(SCDesc, STI, SchedModel);
+  }
+  return Optional<double>();
+}
Index: lib/MC/MCAsmStreamer.cpp
===================================================================
--- lib/MC/MCAsmStreamer.cpp
+++ lib/MC/MCAsmStreamer.cpp
@@ -103,7 +103,8 @@
   void AddComment(const Twine &T, bool EOL = true) override;
 
   /// AddEncodingComment - Add a comment showing the encoding of an instruction.
-  void AddEncodingComment(const MCInst &Inst, const MCSubtargetInfo &);
+  void AddEncodingComment(const MCInst &Inst, const MCSubtargetInfo &,
+                          bool PrintSchedInfo);
 
   /// GetCommentOS - Return a raw_ostream that comments can be written to.
   /// Unlike AddComment, you are required to terminate comments with \n if you
@@ -278,7 +279,8 @@
   void EmitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except) override;
   void EmitWinEHHandlerData() override;
 
-  void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+  void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                       bool PrintSchedInfo) override;
 
   void EmitBundleAlignMode(unsigned AlignPow2) override;
   void EmitBundleLock(bool AlignToEnd) override;
@@ -1504,7 +1506,8 @@
 }
 
 void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
-                                       const MCSubtargetInfo &STI) {
+                                       const MCSubtargetInfo &STI,
+                                       bool PrintSchedInfo) {
   raw_ostream &OS = GetCommentOS();
   SmallString<256> Code;
   SmallVector Fixups;
@@ -1577,7 +1580,9 @@
       }
     }
   }
-  OS << "]\n";
+  OS << "]";
+  if (Fixups.size() || !PrintSchedInfo)
+    OS << "\n";
 
   for (unsigned i = 0, e = Fixups.size(); i != e; ++i) {
     MCFixup &F = Fixups[i];
@@ -1588,16 +1593,19 @@
 }
 
 void MCAsmStreamer::EmitInstruction(const MCInst &Inst,
-                                    const MCSubtargetInfo &STI) {
+                                    const MCSubtargetInfo &STI,
+                                    bool PrintSchedInfo) {
   assert(getCurrentSectionOnly() &&
          "Cannot emit contents before setting section!");
 
   // Show the encoding in a comment if we have a code emitter.
   if (Emitter)
-    AddEncodingComment(Inst, STI);
+    AddEncodingComment(Inst, STI, PrintSchedInfo);
 
   // Show the MCInst if enabled.
if (ShowInst) { + if (PrintSchedInfo) + GetCommentOS() << "\n"; Inst.dump_pretty(GetCommentOS(), InstPrinter.get(), "\n "); GetCommentOS() << "\n"; } @@ -1607,6 +1615,16 @@ else InstPrinter->printInst(&Inst, OS, "", STI); + if (PrintSchedInfo) { + std::string SI = STI.getSchedInfoStr(Inst); + if (!SI.empty()) + GetCommentOS() << SI; + } + + StringRef Comments = CommentToEmit; + if (Comments.size() && Comments.back() != '\n') + GetCommentOS() << "\n"; + EmitEOL(); } Index: lib/MC/MCObjectStreamer.cpp =================================================================== --- lib/MC/MCObjectStreamer.cpp +++ lib/MC/MCObjectStreamer.cpp @@ -228,7 +228,7 @@ } void MCObjectStreamer::EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) { + const MCSubtargetInfo &STI, bool) { MCStreamer::EmitInstruction(Inst, STI); MCSection *Sec = getCurrentSectionOnly(); Index: lib/MC/MCStreamer.cpp =================================================================== --- lib/MC/MCStreamer.cpp +++ lib/MC/MCStreamer.cpp @@ -777,8 +777,8 @@ } } -void MCStreamer::EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) { +void MCStreamer::EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool PrintSchedInfo) { // Scan for values. for (unsigned i = Inst.getNumOperands(); i--;) if (Inst.getOperand(i).isExpr()) Index: lib/Object/RecordStreamer.h =================================================================== --- lib/Object/RecordStreamer.h +++ lib/Object/RecordStreamer.h @@ -34,7 +34,8 @@ const_iterator begin(); const_iterator end(); RecordStreamer(MCContext &Context); - void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override; + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool) override; void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override; void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override; bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override; Index: lib/Object/RecordStreamer.cpp =================================================================== --- lib/Object/RecordStreamer.cpp +++ lib/Object/RecordStreamer.cpp @@ -78,8 +78,8 @@ RecordStreamer::RecordStreamer(MCContext &Context) : MCStreamer(Context) {} void RecordStreamer::EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) { - MCStreamer::EmitInstruction(Inst, STI); + const MCSubtargetInfo &STI, bool B) { + MCStreamer::EmitInstruction(Inst, STI, B); } void RecordStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) { Index: lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp =================================================================== --- lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -102,8 +102,8 @@ /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to add the appropriate mapping symbol if /// necessary. - void EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) override { + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool B) override { EmitA64MappingSymbol(); MCELFStreamer::EmitInstruction(Inst, STI); } Index: lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp =================================================================== --- lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -476,8 +476,8 @@ /// This function is the one used to emit instruction data into the ELF /// streamer. 
We override it to add the appropriate mapping symbol if /// necessary. - void EmitInstruction(const MCInst& Inst, - const MCSubtargetInfo &STI) override { + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool) override { if (IsThumb) EmitThumbMappingSymbol(); else Index: lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h =================================================================== --- lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h +++ lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h @@ -34,7 +34,8 @@ MCELFStreamer(Context, TAB, OS, Emitter), MCII (createHexagonMCInstrInfo()) {} - void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override; + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool B) override; void EmitSymbol(const MCInst &Inst); void HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment, Index: lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp =================================================================== --- lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -44,7 +44,7 @@ cl::init(8)); void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCB, - const MCSubtargetInfo &STI) { + const MCSubtargetInfo &STI, bool) { assert(MCB.getOpcode() == Hexagon::BUNDLE); assert(HexagonMCInstrInfo::bundleSize(MCB) <= HEXAGON_PACKET_SIZE); assert(HexagonMCInstrInfo::bundleSize(MCB) > 0); Index: lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h =================================================================== --- lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h +++ lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h @@ -45,7 +45,8 @@ /// \p Inst is actually emitted. For example, we can inspect the operands and /// gather sufficient information that allows us to reason about the register /// usage for the translation unit. - void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override; + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool B = false) override; /// Overriding this function allows us to record all labels that should be /// marked as microMIPS. Based on this data marking is done in Index: lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp =================================================================== --- lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -20,7 +20,7 @@ using namespace llvm; void MipsELFStreamer::EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) { + const MCSubtargetInfo &STI, bool) { MCELFStreamer::EmitInstruction(Inst, STI); MCContext &Context = getContext(); Index: lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp =================================================================== --- lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp +++ lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp @@ -139,8 +139,8 @@ public: /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to mask dangerous instructions. - void EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) override { + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool) override { // Sandbox indirect jumps. 
if (isIndirectJump(Inst)) { if (PendingCall) Index: lib/Target/X86/InstPrinter/X86InstComments.h =================================================================== --- lib/Target/X86/InstPrinter/X86InstComments.h +++ lib/Target/X86/InstPrinter/X86InstComments.h @@ -23,6 +23,7 @@ class MCInst; class raw_ostream; + class AsmPrinter; bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, const char *(*getRegName)(unsigned)); } Index: lib/Target/X86/InstPrinter/X86InstComments.cpp =================================================================== --- lib/Target/X86/InstPrinter/X86InstComments.cpp +++ lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -15,8 +15,9 @@ #include "X86InstComments.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "Utils/X86ShuffleDecode.h" -#include "llvm/MC/MCInst.h" +#include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineValueType.h" +#include "llvm/MC/MCInst.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -1189,8 +1190,8 @@ OS << ']'; --i; // For loop increments element #. } - //MI->print(OS, 0); - OS << "\n"; + + // OS << "\n"; // We successfully added a comment to this instruction. return true; Index: lib/Target/X86/X86MCInstLower.cpp =================================================================== --- lib/Target/X86/X86MCInstLower.cpp +++ lib/Target/X86/X86MCInstLower.cpp @@ -102,7 +102,7 @@ } void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) { - OutStreamer->EmitInstruction(Inst, getSubtargetInfo()); + OutStreamer->EmitInstruction(Inst, getSubtargetInfo(), EnablePrintSchedInfo); SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get()); } @@ -1529,7 +1529,8 @@ SmallVector Mask; DecodePSHUFBMask(C, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask)); + OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask), + !EnablePrintSchedInfo); } break; } @@ -1600,7 +1601,8 @@ SmallVector Mask; DecodeVPERMILPMask(C, ElSize, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask)); + OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask), + !EnablePrintSchedInfo); } break; } @@ -1630,7 +1632,8 @@ SmallVector Mask; DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask)); + OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask), + !EnablePrintSchedInfo); } break; } @@ -1646,7 +1649,8 @@ SmallVector Mask; DecodeVPPERMMask(C, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask)); + OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask), + !EnablePrintSchedInfo); } break; } @@ -1706,7 +1710,7 @@ CS << "?"; } CS << "]"; - OutStreamer->AddComment(CS.str()); + OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); } else if (auto *CV = dyn_cast(C)) { CS << "<"; for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) { @@ -1738,7 +1742,7 @@ } } CS << ">"; - OutStreamer->AddComment(CS.str()); + OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); } } break; Index: lib/Target/X86/X86Subtarget.h =================================================================== --- lib/Target/X86/X86Subtarget.h +++ lib/Target/X86/X86Subtarget.h @@ -21,6 +21,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/GlobalISel/GISelAccessor.h" +#include "llvm/CodeGen/TargetSchedule.h" #include "llvm/IR/CallingConv.h" #include 
"llvm/MC/MCInstrItineraries.h" #include "llvm/Target/TargetMachine.h" @@ -624,6 +625,14 @@ /// Enable the MachineScheduler pass for all X86 subtargets. bool enableMachineScheduler() const override { return true; } + bool supportPrintSchedInfo() const override { return true; } + + /// The taget schedule model. + TargetSchedModel TSchedModel; + /// Returns string representation of scheduler comment. + virtual std::string getSchedInfoStr(const MachineInstr &MI) const override; + virtual std::string getSchedInfoStr(const MCInst &MCI) const override; + bool enableEarlyIfConversion() const override; /// Return the instruction itineraries based on the subtarget selection. Index: lib/Target/X86/X86Subtarget.cpp =================================================================== --- lib/Target/X86/X86Subtarget.cpp +++ lib/Target/X86/X86Subtarget.cpp @@ -347,6 +347,8 @@ setPICStyle(PICStyles::StubPIC); else if (isTargetELF()) setPICStyle(PICStyles::GOT); + + TSchedModel.init(getSchedModel(), this, getInstrInfo()); } const CallLowering *X86Subtarget::getCallLowering() const { @@ -372,3 +374,34 @@ bool X86Subtarget::enableEarlyIfConversion() const { return hasCMov() && X86EarlyIfConv; } + +static std::string creatSchedInfoStr(unsigned Latency, + Optional RThroughput) { + std::string Comment; + raw_string_ostream CS(Comment); + if (Latency > 0 && RThroughput.hasValue()) + CS << "[" << Latency << format(":%2.2f", RThroughput.getValue()) << "]"; + else if (Latency > 0) + CS << "[" << Latency << ":?]"; + else if (RThroughput.hasValue()) + CS << "[?:" << RThroughput.getValue() << "]"; + CS.flush(); + return Comment; +} + +std::string X86Subtarget::getSchedInfoStr(const MachineInstr &MI) const { + if (MI.isPseudo() || MI.isTerminator()) + return std::string(); + unsigned Latency = TSchedModel.computeInstrLatency(&MI); + Optional RThroughput = TSchedModel.computeInstrRThroughput(&MI); + return creatSchedInfoStr(Latency, RThroughput); +} + +std::string X86Subtarget::getSchedInfoStr(const MCInst &MCI) const { + if (!TSchedModel.hasInstrSchedModel()) + return std::string(); + unsigned Latency = TSchedModel.computeInstrLatency(MCI.getOpcode()); + Optional RThroughput = + TSchedModel.computeInstrRThroughput(MCI.getOpcode()); + return creatSchedInfoStr(Latency, RThroughput); +} Index: test/CodeGen/X86/2011-10-21-widen-cmp.ll =================================================================== --- test/CodeGen/X86/2011-10-21-widen-cmp.ll +++ test/CodeGen/X86/2011-10-21-widen-cmp.ll @@ -7,11 +7,11 @@ define void @cmp_2_floats(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: cmp_2_floats: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: movaps %xmm0, %xmm2 -; CHECK-NEXT: cmpordps %xmm0, %xmm0 -; CHECK-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; CHECK-NEXT: movlps %xmm1, (%rax) -; CHECK-NEXT: retq +; CHECK-NEXT: movaps %xmm0, %xmm2 # [1:1.00] +; CHECK-NEXT: cmpordps %xmm0, %xmm0 # [3:1.00] +; CHECK-NEXT: blendvps %xmm0, %xmm2, %xmm1 # [1:0.50] +; CHECK-NEXT: movlps %xmm1, (%rax) # [1:1.00] +; CHECK-NEXT: retq # [5:1.00] entry: %0 = fcmp oeq <2 x float> undef, undef %1 = select <2 x i1> %0, <2 x float> %a, <2 x float> %b @@ -22,11 +22,11 @@ define void @cmp_2_doubles(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: cmp_2_doubles: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: movapd %xmm0, %xmm2 -; CHECK-NEXT: cmpordpd %xmm0, %xmm0 -; CHECK-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; CHECK-NEXT: movapd %xmm1, (%rax) -; CHECK-NEXT: retq +; CHECK-NEXT: movapd %xmm0, %xmm2 # [1:1.00] +; CHECK-NEXT: cmpordpd %xmm0, %xmm0 # [3:1.00] +; CHECK-NEXT: 
blendvpd %xmm0, %xmm2, %xmm1 # [1:0.50] +; CHECK-NEXT: movapd %xmm1, (%rax) # [1:1.00] +; CHECK-NEXT: retq # [5:1.00] entry: %0 = fcmp oeq <2 x double> undef, undef %1 = select <2 x i1> %0, <2 x double> %a, <2 x double> %b @@ -38,7 +38,8 @@ ; CHECK-LABEL: mp_11193: ; CHECK: # BB#0: # %allocas ; CHECK-NEXT: movl $-1082130432, (%rsi) # imm = 0xBF800000 -; CHECK-NEXT: retq +; CHECK-NEXT: # [1:1.00] +; CHECK-NEXT: retq # [5:1.00] allocas: %bincmp = fcmp olt <8 x float> , %t = extractelement <8 x i1> %bincmp, i32 0 Index: test/CodeGen/X86/avg.ll =================================================================== --- test/CodeGen/X86/avg.ll +++ test/CodeGen/X86/avg.ll @@ -7,35 +7,35 @@ define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) { ; SSE2-LABEL: avg_v4i8: ; SSE2: # BB#0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: pavgb %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero[4:0.50] +; SSE2-NEXT: pavgb %xmm0, %xmm1 # [1:0.50] +; SSE2-NEXT: movd %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v4i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero[4:0.50] +; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0 # [1:0.50] +; AVX2-NEXT: vmovd %xmm0, (%rax) # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v4i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512F-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, (%rax) -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero[4:0.50] +; AVX512F-NEXT: vpavgb %xmm0, %xmm1, %xmm0 # [1:0.50] +; AVX512F-NEXT: vmovd %xmm0, (%rax) # [1:1.00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v4i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512BW-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, (%rax) -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero[4:0.50] +; AVX512BW-NEXT: vpavgb %xmm0, %xmm1, %xmm0 # [1:0.50] +; AVX512BW-NEXT: vmovd %xmm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <4 x i8>, <4 x i8>* %a %2 = load <4 x i8>, <4 x i8>* %b %3 = zext <4 x i8> %1 to <4 x i32> @@ -51,35 +51,35 @@ define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) { ; SSE2-LABEL: avg_v8i8: ; SSE2: # BB#0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: pavgb %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero[4:0.50] +; SSE2-NEXT: pavgb %xmm0, %xmm1 # [1:0.50] +; SSE2-NEXT: movq %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v8i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero[4:0.50] +; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0 # [1:0.50] +; AVX2-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v8i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rax) -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero[4:0.50] +; AVX512F-NEXT: vpavgb %xmm0, %xmm1, %xmm0 # [1:0.50] +; AVX512F-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v8i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rax) -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero[4:0.50] +; AVX512BW-NEXT: vpavgb %xmm0, %xmm1, %xmm0 # [1:0.50] +; AVX512BW-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <8 x i8>, <8 x i8>* %a %2 = load <8 x i8>, <8 x i8>* %b %3 = zext <8 x i8> %1 to <8 x i32> @@ -95,31 +95,31 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) { ; SSE2-LABEL: avg_v16i8: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rsi), %xmm0 # [4:0.50] +; SSE2-NEXT: pavgb (%rdi), %xmm0 # [5:0.50] +; SSE2-NEXT: movdqu %xmm0, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa (%rsi), %xmm0 # [4:0.50] +; AVX2-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # [5:0.50] +; AVX2-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqu %xmm0, (%rax) -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovdqa (%rsi), %xmm0 # [4:0.50] +; AVX512F-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # [5:0.50] +; AVX512F-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512BW-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 # [4:0.50] +; AVX512BW-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # [5:0.50] +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <16 x i8>, <16 x i8>* %a %2 = load <16 x i8>, <16 x i8>* %b %3 = zext <16 x i8> %1 to <16 x i32> @@ -135,115 +135,117 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { ; SSE2-LABEL: avg_v32i8: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm8 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = 
xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; 
SSE2-NEXT: paddd %xmm9, %xmm13 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: paddd %xmm14, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm3 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm5 -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm13 -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm7, %xmm3 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm13, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rdi), %xmm8 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rdi), %xmm11 # [4:0.50] +; SSE2-NEXT: movdqa (%rsi), %xmm0 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 # [4:0.50] +; SSE2-NEXT: pxor %xmm4, %xmm4 # [1:0.33] +; SSE2-NEXT: movdqa %xmm8, %xmm10 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15][1:0.50] +; SSE2-NEXT: movdqa %xmm10, %xmm2 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7][1:0.50] +; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7][1:0.50] +; SSE2-NEXT: movdqa %xmm8, %xmm12 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3][1:0.50] +; SSE2-NEXT: movdqa %xmm11, %xmm15 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15][1:0.50] +; SSE2-NEXT: movdqa %xmm15, %xmm14 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7][1:0.50] +; SSE2-NEXT: movdqa %xmm11, %xmm9 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 
= xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3][1:0.50] +; SSE2-NEXT: movdqa %xmm0, %xmm3 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15][1:0.50] +; SSE2-NEXT: movdqa %xmm3, %xmm7 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7][1:0.50] +; SSE2-NEXT: movdqa %xmm0, %xmm6 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3][1:0.50] +; SSE2-NEXT: movdqa %xmm1, %xmm2 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15][1:0.50] +; SSE2-NEXT: movdqa %xmm2, %xmm5 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7][1:0.50] +; SSE2-NEXT: movdqa %xmm1, %xmm13 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3][1:0.50] +; SSE2-NEXT: paddd %xmm11, %xmm1 # [1:0.50] +; SSE2-NEXT: paddd %xmm9, %xmm13 # [1:0.50] +; SSE2-NEXT: paddd %xmm15, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm14, %xmm5 # [1:0.50] +; SSE2-NEXT: paddd %xmm8, %xmm0 # [1:0.50] +; SSE2-NEXT: paddd %xmm12, %xmm6 # [1:0.50] +; SSE2-NEXT: paddd %xmm10, %xmm3 # [1:0.50] +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1][4:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm7 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm3 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm6 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm0 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm5 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm13 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm3 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm7 # [1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0][4:0.50] +; SSE2-NEXT: pand %xmm4, %xmm7 # [1:0.33] +; SSE2-NEXT: pand %xmm4, %xmm3 # [1:0.33] +; SSE2-NEXT: packuswb %xmm7, %xmm3 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm0 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm6 # [1:0.50] +; SSE2-NEXT: pand %xmm4, %xmm6 # [1:0.33] +; SSE2-NEXT: pand %xmm4, %xmm0 # [1:0.33] +; SSE2-NEXT: packuswb %xmm6, %xmm0 # [1:0.50] +; SSE2-NEXT: packuswb %xmm3, %xmm0 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm5 # 
[1:0.50] +; SSE2-NEXT: pand %xmm4, %xmm5 # [1:0.33] +; SSE2-NEXT: pand %xmm4, %xmm2 # [1:0.33] +; SSE2-NEXT: packuswb %xmm5, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm13 # [1:0.50] +; SSE2-NEXT: pand %xmm4, %xmm13 # [1:0.33] +; SSE2-NEXT: pand %xmm4, %xmm1 # [1:0.33] +; SSE2-NEXT: packuswb %xmm13, %xmm1 # [1:0.50] +; SSE2-NEXT: packuswb %xmm2, %xmm1 # [1:0.50] +; SSE2-NEXT: movdqu %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm0, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v32i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa (%rsi), %ymm0 # [4:0.50] +; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 # [5:0.50] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX2-NEXT: vzeroupper # [?:0.000000e+00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v32i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 # [4:0.50] +; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0 # [5:0.50] +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX512F-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v32i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512BW-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0 # [4:0.50] +; AVX512BW-NEXT: vpavgb (%rdi), %ymm0, %ymm0 # [5:0.50] +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <32 x i8>, <32 x i8>* %a %2 = load <32 x i8>, <32 x i8>* %b %3 = zext <32 x i8> %1 to <32 x i32> @@ -259,279 +261,317 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; SSE2-LABEL: avg_v64i8: ; SSE2: # BB#0: -; SSE2-NEXT: subq $152, %rsp +; SSE2-NEXT: subq $152, %rsp # [1:0.33] ; SSE2-NEXT: .Lcfi0: ; SSE2-NEXT: .cfi_def_cfa_offset 160 -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm4 -; SSE2-NEXT: movdqa 32(%rdi), %xmm5 -; SSE2-NEXT: movdqa 48(%rdi), %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, 
%xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsi), %xmm14 -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm7, %xmm15 -; SSE2-NEXT: punpckhwd 
{{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm14, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE2-NEXT: movdqa 16(%rsi), %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm6, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm12, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm5, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa 48(%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload -; 
SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm8, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload -; SSE2-NEXT: paddd (%rsp), %xmm11 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm14 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm0, %xmm15 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: paddd %xmm0, %xmm14 -; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm15 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: packuswb %xmm15, %xmm7 -; SSE2-NEXT: psrld $1, %xmm14 -; SSE2-NEXT: psrld $1, %xmm9 -; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pand %xmm0, %xmm14 -; SSE2-NEXT: packuswb %xmm9, %xmm14 -; SSE2-NEXT: packuswb %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm0, %xmm13 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: packuswb %xmm13, %xmm6 -; SSE2-NEXT: psrld $1, %xmm12 -; SSE2-NEXT: psrld $1, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm12 -; SSE2-NEXT: packuswb %xmm10, %xmm12 -; SSE2-NEXT: packuswb %xmm6, %xmm12 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: psrld $1, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: packuswb %xmm11, %xmm5 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm6, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm5 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: packuswb %xmm5, %xmm4 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm5, %xmm1 -; SSE2-NEXT: packuswb %xmm4, %xmm1 -; SSE2-NEXT: movdqu 
%xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm12, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) -; SSE2-NEXT: addq $152, %rsp -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rdi), %xmm1 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rdi), %xmm4 # [4:0.50] +; SSE2-NEXT: movdqa 32(%rdi), %xmm5 # [4:0.50] +; SSE2-NEXT: movdqa 48(%rdi), %xmm6 # [4:0.50] +; SSE2-NEXT: pxor %xmm0, %xmm0 # [1:0.33] +; SSE2-NEXT: movdqa %xmm1, %xmm3 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15][1:0.50] +; SSE2-NEXT: movdqa %xmm3, %xmm2 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm1, %xmm2 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: movdqa %xmm4, %xmm3 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15][1:0.50] +; SSE2-NEXT: movdqa %xmm3, %xmm2 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm4, %xmm2 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: movdqa %xmm5, %xmm3 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15][1:0.50] +; SSE2-NEXT: movdqa %xmm3, %xmm2 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: 
punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm5, %xmm2 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: movdqa %xmm6, %xmm8 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15][1:0.50] +; SSE2-NEXT: movdqa %xmm8, %xmm1 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm6, %xmm1 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: movdqa (%rsi), %xmm14 # [4:0.50] +; SSE2-NEXT: movdqa %xmm14, %xmm7 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15][1:0.50] +; SSE2-NEXT: movdqa %xmm7, %xmm15 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm14, %xmm9 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa 16(%rsi), %xmm12 # [4:0.50] +; SSE2-NEXT: movdqa %xmm12, %xmm6 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15][1:0.50] +; SSE2-NEXT: movdqa %xmm6, %xmm13 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd 
{{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm12, %xmm10 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa 32(%rsi), %xmm2 # [4:0.50] +; SSE2-NEXT: movdqa %xmm2, %xmm5 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15][1:0.50] +; SSE2-NEXT: movdqa %xmm5, %xmm11 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm2, %xmm1 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa 48(%rsi), %xmm1 # [4:0.50] +; SSE2-NEXT: movdqa %xmm1, %xmm4 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15][1:0.50] +; SSE2-NEXT: movdqa %xmm4, %xmm3 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm1, %xmm3 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3][1:0.50] +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: paddd %xmm8, %xmm4 # [1:0.50] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload [4:0.50] +; SSE2-NEXT: # [4:0.50] +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload [4:0.50] +; SSE2-NEXT: # [4:0.50] +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded 
Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: paddd (%rsp), %xmm11 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm14 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1][4:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm15 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm7 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm9 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm14 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm13 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm6 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm10 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm12 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm11 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm5 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm3 # [1:0.50] +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: paddd %xmm0, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm8 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm4 # [1:0.50] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload [4:0.50] +; SSE2-NEXT: # [4:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm3 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm7 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm15 # [1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0][4:0.50] +; SSE2-NEXT: pand %xmm0, %xmm15 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm7 # [1:0.33] +; SSE2-NEXT: packuswb %xmm15, %xmm7 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm14 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm9 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm9 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm14 # [1:0.33] +; SSE2-NEXT: packuswb %xmm9, %xmm14 # [1:0.50] +; SSE2-NEXT: packuswb %xmm7, %xmm14 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm6 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm13 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm13 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm6 # [1:0.33] +; SSE2-NEXT: packuswb %xmm13, %xmm6 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm12 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm10 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm10 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm12 # [1:0.33] +; SSE2-NEXT: packuswb %xmm10, %xmm12 # [1:0.50] +; SSE2-NEXT: packuswb %xmm6, %xmm12 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm5 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm11 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm11 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm5 # [1:0.33] +; SSE2-NEXT: packuswb %xmm11, %xmm5 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm2 # [1:0.50] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload [4:0.50] +; SSE2-NEXT: # [4:0.50] +; SSE2-NEXT: psrld $1, %xmm6 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm6 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm2 # 
[1:0.33] +; SSE2-NEXT: packuswb %xmm6, %xmm2 # [1:0.50] +; SSE2-NEXT: packuswb %xmm5, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm4 # [1:0.50] +; SSE2-NEXT: movdqa %xmm8, %xmm5 # [1:0.33] +; SSE2-NEXT: psrld $1, %xmm5 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm5 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm4 # [1:0.33] +; SSE2-NEXT: packuswb %xmm5, %xmm4 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm1 # [1:0.50] +; SSE2-NEXT: movdqa %xmm3, %xmm5 # [1:0.33] +; SSE2-NEXT: psrld $1, %xmm5 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm5 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm1 # [1:0.33] +; SSE2-NEXT: packuswb %xmm5, %xmm1 # [1:0.50] +; SSE2-NEXT: packuswb %xmm4, %xmm1 # [1:0.50] +; SSE2-NEXT: movdqu %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm2, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm12, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm14, (%rax) # [1:1.00] +; SSE2-NEXT: addq $152, %rsp # [1:0.33] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v64i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm15, %ymm7, %ymm7 -; AVX2-NEXT: vpaddd %ymm14, %ymm6, %ymm6 -; AVX2-NEXT: vpaddd %ymm13, %ymm5, %ymm5 -; AVX2-NEXT: vpaddd %ymm12, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm11, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm10, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm9, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8 -; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9 -; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm10 -; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 -; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm1 -; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm11 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm12 -; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 -; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 -; AVX2-NEXT: vpsrld $1, %ymm3, %ymm6 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm7 -; AVX2-NEXT: vpsrld $1, %ymm10, %ymm8 -; AVX2-NEXT: vpsrld $1, %ymm9, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm0 -; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm8 -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm6[0],xmm1[0] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm4 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX2-NEXT: vpshufb %ymm2, %ymm12, %ymm4 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufb 
%xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) -; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpaddd %ymm15, %ymm7, %ymm7 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm14, %ymm6, %ymm6 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm13, %ymm5, %ymm5 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm12, %ymm4, %ymm4 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm11, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm10, %ymm2, %ymm2 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm9, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8 # [4:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm10 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm1 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm0 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm0, %ymm11 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm1, %ymm12 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm3, %ymm6 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm2, %ymm7 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm10, %ymm8 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm9, %ymm3 # [1:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31][4:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,2,2,3][1:1.00] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>[4:0.50] +; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm0 # [1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm8 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm1 # [1:0.50] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0][1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm1 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 # [1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6 # [1:0.50] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm6[0],xmm1[0][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm1 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 # [1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm4 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 # [1:0.50] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0][1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm12, %ymm4 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 # [1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm2 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 # [1:0.50] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = 
xmm2[0],xmm4[0][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 # [1:1.00] +; AVX2-NEXT: vmovdqu %ymm1, (%rax) # [1:1.00] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX2-NEXT: vzeroupper # [?:0.000000e+00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v64i8: ; AVX512F: # BB#0: @@ -558,22 +598,22 @@ ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] ; AVX512F-NEXT: vpmovdb %zmm2, %xmm1 ; AVX512F-NEXT: vpmovdb %zmm3, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) -; AVX512F-NEXT: vmovdqu %ymm0, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # [1:1.00] +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) # [1:1.00] +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX512F-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v64i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 # [4:0.50] ; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <64 x i8>, <64 x i8>* %a %2 = load <64 x i8>, <64 x i8>* %b %3 = zext <64 x i8> %1 to <64 x i32> @@ -589,35 +629,35 @@ define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) { ; SSE2-LABEL: avg_v4i16: ; SSE2: # BB#0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: pavgw %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero[4:0.50] +; SSE2-NEXT: pavgw %xmm0, %xmm1 # [1:0.50] +; SSE2-NEXT: movq %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v4i16: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero[4:0.50] +; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0 # [1:0.50] +; AVX2-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v4i16: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vpavgw %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rax) -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero[4:0.50] +; AVX512F-NEXT: vpavgw %xmm0, %xmm1, %xmm0 # [1:0.50] +; AVX512F-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v4i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vpavgw %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rax) -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero[4:0.50] +; AVX512BW-NEXT: vpavgw %xmm0, %xmm1, %xmm0 # [1:0.50] +; AVX512BW-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = 
load <4 x i16>, <4 x i16>* %a %2 = load <4 x i16>, <4 x i16>* %b %3 = zext <4 x i16> %1 to <4 x i32> @@ -633,31 +673,31 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) { ; SSE2-LABEL: avg_v8i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: pavgw (%rdi), %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rsi), %xmm0 # [4:0.50] +; SSE2-NEXT: pavgw (%rdi), %xmm0 # [5:0.50] +; SSE2-NEXT: movdqu %xmm0, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-NEXT: vpavgw (%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa (%rsi), %xmm0 # [4:0.50] +; AVX2-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # [5:0.50] +; AVX2-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v8i16: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-NEXT: vpavgw (%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqu %xmm0, (%rax) -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovdqa (%rsi), %xmm0 # [4:0.50] +; AVX512F-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # [5:0.50] +; AVX512F-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v8i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512BW-NEXT: vpavgw (%rdi), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 # [4:0.50] +; AVX512BW-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # [5:0.50] +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <8 x i16>, <8 x i16>* %a %2 = load <8 x i16>, <8 x i16>* %b %3 = zext <8 x i16> %1 to <8 x i32> @@ -673,73 +713,73 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { ; SSE2-LABEL: avg_v16i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm5 -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad 
$16, %xmm0 -; SSE2-NEXT: packssdw %xmm3, %xmm0 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm2, %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rdi), %xmm4 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rdi), %xmm5 # [4:0.50] +; SSE2-NEXT: movdqa (%rsi), %xmm0 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 # [4:0.50] +; SSE2-NEXT: pxor %xmm6, %xmm6 # [1:0.33] +; SSE2-NEXT: movdqa %xmm4, %xmm8 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3][1:0.50] +; SSE2-NEXT: movdqa %xmm5, %xmm7 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3][1:0.50] +; SSE2-NEXT: movdqa %xmm0, %xmm3 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3][1:0.50] +; SSE2-NEXT: movdqa %xmm1, %xmm2 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3][1:0.50] +; SSE2-NEXT: paddd %xmm5, %xmm1 # [1:0.50] +; SSE2-NEXT: paddd %xmm7, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm0 # [1:0.50] +; SSE2-NEXT: paddd %xmm8, %xmm3 # [1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1][4:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm3 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm0 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm0 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm3 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm3 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm3 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm0 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm0 # [1:0.50] +; SSE2-NEXT: packssdw %xmm3, %xmm0 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm2 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm2 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm1 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm1 # [1:0.50] +; SSE2-NEXT: packssdw %xmm2, %xmm1 # [1:0.50] +; SSE2-NEXT: movdqu %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm0, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v16i16: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa (%rsi), %ymm0 # [4:0.50] +; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 # [5:0.50] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX2-NEXT: vzeroupper # [?:0.000000e+00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v16i16: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 # [4:0.50] +; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0 # [5:0.50] +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX512F-NEXT: 
vzeroupper # [?:0.000000e+00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v16i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512BW-NEXT: vpavgw (%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0 # [4:0.50] +; AVX512BW-NEXT: vpavgw (%rdi), %ymm0, %ymm0 # [5:0.50] +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <16 x i16>, <16 x i16>* %a %2 = load <16 x i16>, <16 x i16>* %b %3 = zext <16 x i16> %1 to <16 x i32> @@ -755,129 +795,131 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-LABEL: avg_v32i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm10 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm11 -; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm14 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm8, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm15, %xmm5 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm14 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm14 -; 
SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, %xmm14 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: pslld $16, %xmm7 -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm14 -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: packssdw %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pslld $16, %xmm6 -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm6, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pslld $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: pslld $16, %xmm4 -; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: packssdw %xmm4, %xmm3 -; SSE2-NEXT: movdqu %xmm3, (%rax) -; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rdi), %xmm10 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rdi), %xmm9 # [4:0.50] +; SSE2-NEXT: movdqa 32(%rdi), %xmm11 # [4:0.50] +; SSE2-NEXT: movdqa 48(%rdi), %xmm8 # [4:0.50] +; SSE2-NEXT: movdqa (%rsi), %xmm14 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 # [4:0.50] +; SSE2-NEXT: movdqa 32(%rsi), %xmm2 # [4:0.50] +; SSE2-NEXT: movdqa 48(%rsi), %xmm3 # [4:0.50] +; SSE2-NEXT: pxor %xmm0, %xmm0 # [1:0.33] +; SSE2-NEXT: movdqa %xmm10, %xmm4 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm9, %xmm12 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm11, %xmm15 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm8, %xmm13 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm14, %xmm7 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm1, %xmm6 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm2, %xmm5 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = 
xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm3, %xmm4 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3][1:0.50] +; SSE2-NEXT: paddd %xmm8, %xmm3 # [1:0.50] +; SSE2-NEXT: paddd %xmm13, %xmm4 # [1:0.50] +; SSE2-NEXT: paddd %xmm11, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm15, %xmm5 # [1:0.50] +; SSE2-NEXT: paddd %xmm9, %xmm1 # [1:0.50] +; SSE2-NEXT: paddd %xmm12, %xmm6 # [1:0.50] +; SSE2-NEXT: paddd %xmm10, %xmm14 # [1:0.50] +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1][4:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm7 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm14 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm6 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm1 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm5 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm4 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm3 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm14 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm7 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm7 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm7 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm14 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm14 # [1:0.50] +; SSE2-NEXT: packssdw %xmm7, %xmm14 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm6 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm6 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm6 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm1 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm1 # [1:0.50] +; SSE2-NEXT: packssdw %xmm6, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm5 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm5 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm5 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm2 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm2 # [1:0.50] +; SSE2-NEXT: packssdw %xmm5, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm3 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm4 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm4 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm4 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm3 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm3 # [1:0.50] +; SSE2-NEXT: packssdw %xmm4, %xmm3 # [1:0.50] +; SSE2-NEXT: movdqu %xmm3, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm2, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm14, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v32i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; 
AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 -; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) -; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 # [4:0.50] +; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31][4:0.50] +; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} 
ymm0 = ymm0[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3][1:1.00] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm1 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm2 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3][1:1.00] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # [1:1.00] +; AVX2-NEXT: vmovdqu %ymm1, (%rax) # [1:1.00] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX2-NEXT: vzeroupper # [?:0.000000e+00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v32i16: ; AVX512F: # BB#0: @@ -894,16 +936,16 @@ ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdw %zmm0, (%rax) ; AVX512F-NEXT: vpmovdw %zmm1, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v32i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 # [4:0.50] ; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <32 x i16>, <32 x i16>* %a %2 = load <32 x i16>, <32 x i16>* %b %3 = zext <32 x i16> %1 to <32 x i32> @@ -919,35 +961,35 @@ define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) { ; SSE2-LABEL: avg_v4i8_2: ; SSE2: # BB#0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: pavgb %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero[4:0.50] +; SSE2-NEXT: pavgb %xmm0, %xmm1 # [1:0.50] +; SSE2-NEXT: movd %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v4i8_2: ; AVX2: # BB#0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero[4:0.50] +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: vmovd %xmm0, (%rax) # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v4i8_2: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512F-NEXT: vpavgb %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, (%rax) -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero[4:0.50] +; AVX512F-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX512F-NEXT: vmovd %xmm0, (%rax) # [1:1.00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v4i8_2: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512BW-NEXT: vpavgb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, (%rax) -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero[4:0.50] +; AVX512BW-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # 
[1:0.50] +; AVX512BW-NEXT: vmovd %xmm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <4 x i8>, <4 x i8>* %a %2 = load <4 x i8>, <4 x i8>* %b %3 = zext <4 x i8> %1 to <4 x i32> @@ -963,35 +1005,35 @@ define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) { ; SSE2-LABEL: avg_v8i8_2: ; SSE2: # BB#0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: pavgb %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero[4:0.50] +; SSE2-NEXT: pavgb %xmm0, %xmm1 # [1:0.50] +; SSE2-NEXT: movq %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v8i8_2: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero[4:0.50] +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v8i8_2: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vpavgb %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rax) -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero[4:0.50] +; AVX512F-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX512F-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v8i8_2: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vpavgb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rax) -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero[4:0.50] +; AVX512BW-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX512BW-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <8 x i8>, <8 x i8>* %a %2 = load <8 x i8>, <8 x i8>* %b %3 = zext <8 x i8> %1 to <8 x i32> @@ -1007,31 +1049,31 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) { ; SSE2-LABEL: avg_v16i8_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pavgb (%rsi), %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rdi), %xmm0 # [4:0.50] +; SSE2-NEXT: pavgb (%rsi), %xmm0 # [5:0.50] +; SSE2-NEXT: movdqu %xmm0, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v16i8_2: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 # [4:0.50] +; AVX2-NEXT: vpavgb (%rsi), %xmm0, %xmm0 # [5:0.50] +; AVX2-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v16i8_2: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqu %xmm0, (%rax) -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 # [4:0.50] +; AVX512F-NEXT: vpavgb (%rsi), %xmm0, %xmm0 # [5:0.50] +; AVX512F-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v16i8_2: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: 
vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 # [4:0.50] +; AVX512BW-NEXT: vpavgb (%rsi), %xmm0, %xmm0 # [5:0.50] +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <16 x i8>, <16 x i8>* %a %2 = load <16 x i8>, <16 x i8>* %b %3 = zext <16 x i8> %1 to <16 x i32> @@ -1047,115 +1089,117 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) { ; SSE2-LABEL: avg_v32i8_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm8 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: paddd %xmm9, %xmm13 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: paddd %xmm14, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm3 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm5 -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm13 -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm7, %xmm3 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm13, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rdi), %xmm8 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rdi), %xmm11 # [4:0.50] +; SSE2-NEXT: movdqa (%rsi), %xmm0 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 # [4:0.50] +; SSE2-NEXT: pxor %xmm4, %xmm4 # [1:0.33] +; SSE2-NEXT: movdqa %xmm8, %xmm10 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15][1:0.50] +; SSE2-NEXT: movdqa %xmm10, %xmm2 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7][1:0.50] +; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7][1:0.50] +; SSE2-NEXT: movdqa %xmm8, %xmm12 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = 
xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3][1:0.50] +; SSE2-NEXT: movdqa %xmm11, %xmm15 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15][1:0.50] +; SSE2-NEXT: movdqa %xmm15, %xmm14 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7][1:0.50] +; SSE2-NEXT: movdqa %xmm11, %xmm9 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3][1:0.50] +; SSE2-NEXT: movdqa %xmm0, %xmm3 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15][1:0.50] +; SSE2-NEXT: movdqa %xmm3, %xmm7 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7][1:0.50] +; SSE2-NEXT: movdqa %xmm0, %xmm6 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3][1:0.50] +; SSE2-NEXT: movdqa %xmm1, %xmm2 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15][1:0.50] +; SSE2-NEXT: movdqa %xmm2, %xmm5 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7][1:0.50] +; SSE2-NEXT: movdqa %xmm1, %xmm13 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3][1:0.50] +; SSE2-NEXT: paddd %xmm11, %xmm1 # [1:0.50] +; SSE2-NEXT: paddd %xmm9, %xmm13 # [1:0.50] +; SSE2-NEXT: paddd %xmm15, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm14, %xmm5 # [1:0.50] +; SSE2-NEXT: paddd %xmm8, %xmm0 # [1:0.50] +; SSE2-NEXT: paddd %xmm12, %xmm6 # [1:0.50] +; SSE2-NEXT: paddd %xmm10, %xmm3 # [1:0.50] +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1][4:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm7 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm3 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm6 # 
[1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm0 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm5 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm13 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm3 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm7 # [1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0][4:0.50] +; SSE2-NEXT: pand %xmm4, %xmm7 # [1:0.33] +; SSE2-NEXT: pand %xmm4, %xmm3 # [1:0.33] +; SSE2-NEXT: packuswb %xmm7, %xmm3 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm0 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm6 # [1:0.50] +; SSE2-NEXT: pand %xmm4, %xmm6 # [1:0.33] +; SSE2-NEXT: pand %xmm4, %xmm0 # [1:0.33] +; SSE2-NEXT: packuswb %xmm6, %xmm0 # [1:0.50] +; SSE2-NEXT: packuswb %xmm3, %xmm0 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm5 # [1:0.50] +; SSE2-NEXT: pand %xmm4, %xmm5 # [1:0.33] +; SSE2-NEXT: pand %xmm4, %xmm2 # [1:0.33] +; SSE2-NEXT: packuswb %xmm5, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm13 # [1:0.50] +; SSE2-NEXT: pand %xmm4, %xmm13 # [1:0.33] +; SSE2-NEXT: pand %xmm4, %xmm1 # [1:0.33] +; SSE2-NEXT: packuswb %xmm13, %xmm1 # [1:0.50] +; SSE2-NEXT: packuswb %xmm2, %xmm1 # [1:0.50] +; SSE2-NEXT: movdqu %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm0, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v32i8_2: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 # [4:0.50] +; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 # [5:0.50] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX2-NEXT: vzeroupper # [?:0.000000e+00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v32i8_2: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 # [4:0.50] +; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0 # [5:0.50] +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX512F-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v32i8_2: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpavgb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 # [4:0.50] +; AVX512BW-NEXT: vpavgb (%rsi), %ymm0, %ymm0 # [5:0.50] +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <32 x i8>, <32 x i8>* %a %2 = load <32 x i8>, <32 x i8>* %b %3 = zext <32 x i8> %1 to <32 x i32> @@ -1171,209 +1215,215 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) { ; SSE2-LABEL: avg_v64i8_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rsi), %xmm14 -; SSE2-NEXT: movdqa 16(%rsi), %xmm12 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm7, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE2-NEXT: punpcklwd 
{{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm14, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm12, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm6, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm12, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm5, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: paddd %xmm1, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm4, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm3, %xmm3 -; SSE2-NEXT: paddd %xmm2, %xmm2 -; SSE2-NEXT: paddd %xmm10, %xmm10 -; SSE2-NEXT: paddd %xmm5, 
%xmm5 -; SSE2-NEXT: paddd %xmm11, %xmm11 -; SSE2-NEXT: paddd %xmm12, %xmm12 -; SSE2-NEXT: paddd %xmm9, %xmm9 -; SSE2-NEXT: paddd %xmm6, %xmm6 -; SSE2-NEXT: paddd %xmm13, %xmm13 -; SSE2-NEXT: paddd %xmm14, %xmm14 -; SSE2-NEXT: paddd %xmm8, %xmm8 -; SSE2-NEXT: paddd %xmm7, %xmm7 -; SSE2-NEXT: paddd %xmm15, %xmm15 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm0, %xmm15 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: paddd %xmm0, %xmm14 -; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm15 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: packuswb %xmm15, %xmm7 -; SSE2-NEXT: psrld $1, %xmm14 -; SSE2-NEXT: psrld $1, %xmm8 -; SSE2-NEXT: pand %xmm0, %xmm8 -; SSE2-NEXT: pand %xmm0, %xmm14 -; SSE2-NEXT: packuswb %xmm8, %xmm14 -; SSE2-NEXT: packuswb %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm0, %xmm13 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: packuswb %xmm13, %xmm6 -; SSE2-NEXT: psrld $1, %xmm12 -; SSE2-NEXT: psrld $1, %xmm9 -; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pand %xmm0, %xmm12 -; SSE2-NEXT: packuswb %xmm9, %xmm12 -; SSE2-NEXT: packuswb %xmm6, %xmm12 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: psrld $1, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: packuswb %xmm11, %xmm5 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm10, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: packuswb %xmm5, %xmm4 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm5, %xmm1 -; SSE2-NEXT: packuswb %xmm4, %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm12, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rsi), %xmm14 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rsi), %xmm12 # [4:0.50] +; SSE2-NEXT: movdqa 32(%rsi), %xmm2 # [4:0.50] +; SSE2-NEXT: movdqa 48(%rsi), %xmm1 # [4:0.50] +; SSE2-NEXT: pxor %xmm0, %xmm0 # [1:0.33] +; SSE2-NEXT: movdqa %xmm14, %xmm7 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15][1:0.50] +; SSE2-NEXT: movdqa %xmm7, %xmm15 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3][1:0.50] +; 
SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm14, %xmm8 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm12, %xmm6 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15][1:0.50] +; SSE2-NEXT: movdqa %xmm6, %xmm13 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm12, %xmm9 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm2, %xmm5 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15][1:0.50] +; SSE2-NEXT: movdqa %xmm5, %xmm11 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm2, %xmm10 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm1, %xmm4 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15][1:0.50] +; SSE2-NEXT: movdqa %xmm4, %xmm3 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm1, %xmm3 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3][1:0.50] +; SSE2-NEXT: paddd %xmm1, %xmm1 # [1:0.50] +; SSE2-NEXT: paddd %xmm3, %xmm3 # [1:0.50] +; SSE2-NEXT: movdqa %xmm3, 
-{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: paddd %xmm4, %xmm4 # [1:0.50] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload [4:0.50] +; SSE2-NEXT: # [4:0.50] +; SSE2-NEXT: paddd %xmm3, %xmm3 # [1:0.50] +; SSE2-NEXT: paddd %xmm2, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm10, %xmm10 # [1:0.50] +; SSE2-NEXT: paddd %xmm5, %xmm5 # [1:0.50] +; SSE2-NEXT: paddd %xmm11, %xmm11 # [1:0.50] +; SSE2-NEXT: paddd %xmm12, %xmm12 # [1:0.50] +; SSE2-NEXT: paddd %xmm9, %xmm9 # [1:0.50] +; SSE2-NEXT: paddd %xmm6, %xmm6 # [1:0.50] +; SSE2-NEXT: paddd %xmm13, %xmm13 # [1:0.50] +; SSE2-NEXT: paddd %xmm14, %xmm14 # [1:0.50] +; SSE2-NEXT: paddd %xmm8, %xmm8 # [1:0.50] +; SSE2-NEXT: paddd %xmm7, %xmm7 # [1:0.50] +; SSE2-NEXT: paddd %xmm15, %xmm15 # [1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1][4:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm15 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm7 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm8 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm14 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm13 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm6 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm9 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm12 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm11 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm5 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm10 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm3 # [1:0.50] +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: paddd %xmm0, %xmm4 # [1:0.50] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload [4:0.50] +; SSE2-NEXT: # [4:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm3 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm7 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm15 # [1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0][4:0.50] +; SSE2-NEXT: pand %xmm0, %xmm15 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm7 # [1:0.33] +; SSE2-NEXT: packuswb %xmm15, %xmm7 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm14 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm8 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm8 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm14 # [1:0.33] +; SSE2-NEXT: packuswb %xmm8, %xmm14 # [1:0.50] +; SSE2-NEXT: packuswb %xmm7, %xmm14 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm6 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm13 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm13 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm6 # [1:0.33] +; SSE2-NEXT: packuswb %xmm13, %xmm6 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm12 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm9 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm9 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm12 # [1:0.33] +; SSE2-NEXT: packuswb %xmm9, %xmm12 # [1:0.50] +; SSE2-NEXT: packuswb %xmm6, %xmm12 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm5 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm11 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm11 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm5 # [1:0.33] +; SSE2-NEXT: packuswb %xmm11, %xmm5 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm10 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm10 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm2 # [1:0.33] +; SSE2-NEXT: packuswb %xmm10, %xmm2 # [1:0.50] +; SSE2-NEXT: packuswb %xmm5, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm4 # [1:0.50] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload [4:0.50] +; SSE2-NEXT: # [4:0.50] +; SSE2-NEXT: psrld $1, %xmm5 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm5 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm4 # [1:0.33] 
+; SSE2-NEXT: packuswb %xmm5, %xmm4 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm1 # [1:0.50] +; SSE2-NEXT: movdqa %xmm3, %xmm5 # [1:0.33] +; SSE2-NEXT: psrld $1, %xmm5 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm5 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm1 # [1:0.33] +; SSE2-NEXT: packuswb %xmm5, %xmm1 # [1:0.50] +; SSE2-NEXT: packuswb %xmm4, %xmm1 # [1:0.50] +; SSE2-NEXT: movdqu %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm2, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm12, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm14, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v64i8_2: ; AVX2: # BB#0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm7, %ymm7, %ymm7 -; AVX2-NEXT: vpaddd %ymm6, %ymm6, %ymm6 -; AVX2-NEXT: vpaddd %ymm5, %ymm5, %ymm5 -; AVX2-NEXT: vpaddd %ymm4, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm3, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8 -; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9 -; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm10 -; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 -; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm1 -; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm11 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm12 -; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 -; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 -; AVX2-NEXT: vpsrld $1, %ymm3, %ymm6 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm7 -; AVX2-NEXT: vpsrld $1, %ymm10, %ymm8 -; AVX2-NEXT: vpsrld $1, %ymm9, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} 
ymm9 = ymm3[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm0 -; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm8 -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm6[0],xmm1[0] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm4 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX2-NEXT: vpshufb %ymm2, %ymm12, %ymm4 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) -; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpaddd %ymm7, %ymm7, %ymm7 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm6, %ymm6, %ymm6 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm5, %ymm5, %ymm5 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm4, %ymm4, %ymm4 # [1:0.50] +; AVX2-NEXT: vpaddd 
%ymm3, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm2, %ymm2, %ymm2 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm1, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8 # [4:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm10 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm1 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm0 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm0, %ymm11 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm1, %ymm12 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm3, %ymm6 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm2, %ymm7 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm10, %ymm8 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm9, %ymm3 # [1:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31][4:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,2,2,3][1:1.00] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>[4:0.50] +; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm0 # [1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm8 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm1 # [1:0.50] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0][1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm1 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 # [1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6 # [1:0.50] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm6[0],xmm1[0][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm1 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 # [1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm4 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 # [1:0.50] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0][1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm12, %ymm4 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 # [1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm2 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 # [1:0.50] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 # [1:1.00] +; AVX2-NEXT: vmovdqu %ymm1, (%rax) # [1:1.00] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX2-NEXT: vzeroupper # [?:0.000000e+00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v64i8_2: ; AVX512F: # BB#0: @@ -1396,22 +1446,22 @@ ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] ; AVX512F-NEXT: vpmovdb %zmm2, %xmm1 ; AVX512F-NEXT: vpmovdb %zmm3, %xmm2 -; AVX512F-NEXT: 
vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) -; AVX512F-NEXT: vmovdqu %ymm0, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # [1:1.00] +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) # [1:1.00] +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX512F-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v64i8_2: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 # [4:0.50] ; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <64 x i8>, <64 x i8>* %a %2 = load <64 x i8>, <64 x i8>* %b %3 = zext <64 x i8> %1 to <64 x i32> @@ -1428,35 +1478,35 @@ define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) { ; SSE2-LABEL: avg_v4i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: pavgw %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero[4:0.50] +; SSE2-NEXT: pavgw %xmm0, %xmm1 # [1:0.50] +; SSE2-NEXT: movq %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v4i16_2: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero[4:0.50] +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v4i16_2: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vpavgw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rax) -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero[4:0.50] +; AVX512F-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX512F-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v4i16_2: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vpavgw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rax) -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero[4:0.50] +; AVX512BW-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX512BW-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <4 x i16>, <4 x i16>* %a %2 = load <4 x i16>, <4 x i16>* %b %3 = zext <4 x i16> %1 to <4 x i32> @@ -1472,31 +1522,31 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) { ; SSE2-LABEL: avg_v8i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pavgw (%rsi), %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rdi), %xmm0 # [4:0.50] +; SSE2-NEXT: pavgw (%rsi), %xmm0 # [5:0.50] +; SSE2-NEXT: movdqu %xmm0, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v8i16_2: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpavgw (%rsi), %xmm0, 
%xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 # [4:0.50] +; AVX2-NEXT: vpavgw (%rsi), %xmm0, %xmm0 # [5:0.50] +; AVX2-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v8i16_2: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqu %xmm0, (%rax) -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 # [4:0.50] +; AVX512F-NEXT: vpavgw (%rsi), %xmm0, %xmm0 # [5:0.50] +; AVX512F-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v8i16_2: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 # [4:0.50] +; AVX512BW-NEXT: vpavgw (%rsi), %xmm0, %xmm0 # [5:0.50] +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <8 x i16>, <8 x i16>* %a %2 = load <8 x i16>, <8 x i16>* %b %3 = zext <8 x i16> %1 to <8 x i32> @@ -1512,73 +1562,73 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { ; SSE2-LABEL: avg_v16i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm5 -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm3, %xmm0 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm2, %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rdi), %xmm4 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rdi), %xmm5 # [4:0.50] +; SSE2-NEXT: movdqa (%rsi), %xmm0 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 # [4:0.50] +; SSE2-NEXT: pxor %xmm6, %xmm6 # [1:0.33] +; SSE2-NEXT: movdqa %xmm4, %xmm8 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = 
xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3][1:0.50] +; SSE2-NEXT: movdqa %xmm5, %xmm7 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3][1:0.50] +; SSE2-NEXT: movdqa %xmm0, %xmm3 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3][1:0.50] +; SSE2-NEXT: movdqa %xmm1, %xmm2 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3][1:0.50] +; SSE2-NEXT: paddd %xmm5, %xmm1 # [1:0.50] +; SSE2-NEXT: paddd %xmm7, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm0 # [1:0.50] +; SSE2-NEXT: paddd %xmm8, %xmm3 # [1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1][4:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm3 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm0 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm0 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm3 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm3 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm3 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm0 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm0 # [1:0.50] +; SSE2-NEXT: packssdw %xmm3, %xmm0 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm2 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm2 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm1 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm1 # [1:0.50] +; SSE2-NEXT: packssdw %xmm2, %xmm1 # [1:0.50] +; SSE2-NEXT: movdqu %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm0, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v16i16_2: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 # [4:0.50] +; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 # [5:0.50] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX2-NEXT: vzeroupper # [?:0.000000e+00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v16i16_2: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 # [4:0.50] +; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 # [5:0.50] +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX512F-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v16i16_2: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 # [4:0.50] +; AVX512BW-NEXT: vpavgw (%rsi), %ymm0, %ymm0 # [5:0.50] +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <16 x i16>, <16 x i16>* %a %2 = load <16 x i16>, <16 x i16>* %b %3 = 
zext <16 x i16> %1 to <16 x i32> @@ -1594,129 +1644,131 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-LABEL: avg_v32i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm10 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm11 -; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm14 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm8, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm15, %xmm5 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm14 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm14 -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, %xmm14 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: pslld $16, %xmm7 -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm14 -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: packssdw %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pslld $16, %xmm6 -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; 
SSE2-NEXT: packssdw %xmm6, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pslld $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: pslld $16, %xmm4 -; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: packssdw %xmm4, %xmm3 -; SSE2-NEXT: movdqu %xmm3, (%rax) -; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rdi), %xmm10 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rdi), %xmm9 # [4:0.50] +; SSE2-NEXT: movdqa 32(%rdi), %xmm11 # [4:0.50] +; SSE2-NEXT: movdqa 48(%rdi), %xmm8 # [4:0.50] +; SSE2-NEXT: movdqa (%rsi), %xmm14 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 # [4:0.50] +; SSE2-NEXT: movdqa 32(%rsi), %xmm2 # [4:0.50] +; SSE2-NEXT: movdqa 48(%rsi), %xmm3 # [4:0.50] +; SSE2-NEXT: pxor %xmm0, %xmm0 # [1:0.33] +; SSE2-NEXT: movdqa %xmm10, %xmm4 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm9, %xmm12 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm11, %xmm15 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm8, %xmm13 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm14, %xmm7 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm1, %xmm6 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm2, %xmm5 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm3, %xmm4 # [1:0.33] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3][1:0.50] +; SSE2-NEXT: paddd %xmm8, %xmm3 # [1:0.50] +; SSE2-NEXT: paddd %xmm13, %xmm4 # [1:0.50] +; SSE2-NEXT: paddd %xmm11, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm15, %xmm5 # [1:0.50] +; SSE2-NEXT: paddd %xmm9, %xmm1 
# [1:0.50] +; SSE2-NEXT: paddd %xmm12, %xmm6 # [1:0.50] +; SSE2-NEXT: paddd %xmm10, %xmm14 # [1:0.50] +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload [5:0.50] +; SSE2-NEXT: # [5:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1][4:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm7 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm14 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm6 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm1 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm5 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm4 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm3 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm14 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm7 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm7 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm7 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm14 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm14 # [1:0.50] +; SSE2-NEXT: packssdw %xmm7, %xmm14 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm6 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm6 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm6 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm1 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm1 # [1:0.50] +; SSE2-NEXT: packssdw %xmm6, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm5 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm5 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm5 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm2 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm2 # [1:0.50] +; SSE2-NEXT: packssdw %xmm5, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm3 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm4 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm4 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm4 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm3 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm3 # [1:0.50] +; SSE2-NEXT: packssdw %xmm4, %xmm3 # [1:0.50] +; SSE2-NEXT: movdqu %xmm3, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm2, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm14, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v32i16_2: ; AVX2: # BB#0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 -; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpsrld $1, 
%ymm3, %ymm3 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) -; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 # [4:0.50] +; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31][4:0.50] +; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3][1:1.00] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm1 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm2 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3][1:1.00] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # [1:1.00] +; AVX2-NEXT: vmovdqu %ymm1, (%rax) # [1:1.00] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX2-NEXT: vzeroupper # [?:0.000000e+00] +; AVX2-NEXT: retq # [5:1.00] ; ; 
AVX512F-LABEL: avg_v32i16_2: ; AVX512F: # BB#0: @@ -1733,16 +1785,16 @@ ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdw %zmm0, (%rax) ; AVX512F-NEXT: vpmovdw %zmm1, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v32i16_2: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 # [4:0.50] ; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <32 x i16>, <32 x i16>* %a %2 = load <32 x i16>, <32 x i16>* %b %3 = zext <32 x i16> %1 to <32 x i32> @@ -1758,31 +1810,31 @@ define void @avg_v4i8_const(<4 x i8>* %a) { ; SSE2-LABEL: avg_v4i8_const: ; SSE2: # BB#0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movd %xmm0, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0 # [5:0.50] +; SSE2-NEXT: movd %xmm0, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v4i8_const: ; AVX2: # BB#0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX2-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX2-NEXT: vmovd %xmm0, (%rax) # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v4i8_const: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, (%rax) -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX512F-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX512F-NEXT: vmovd %xmm0, (%rax) # [1:1.00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v4i8_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, (%rax) -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX512BW-NEXT: vmovd %xmm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <4 x i8>, <4 x i8>* %a %2 = zext <4 x i8> %1 to <4 x i32> %3 = add nuw nsw <4 x i32> %2, @@ -1795,31 +1847,31 @@ define void @avg_v8i8_const(<8 x i8>* %a) { ; SSE2-LABEL: avg_v8i8_const: ; SSE2: # BB#0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movq %xmm0, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0 # [5:0.50] +; SSE2-NEXT: movq %xmm0, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v8i8_const: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX2-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX2-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v8i8_const: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = 
mem[0],zero -; AVX512F-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rax) -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX512F-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX512F-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v8i8_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rax) -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX512BW-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <8 x i8>, <8 x i8>* %a %2 = zext <8 x i8> %1 to <8 x i32> %3 = add nuw nsw <8 x i32> %2, @@ -1832,31 +1884,31 @@ define void @avg_v16i8_const(<16 x i8>* %a) { ; SSE2-LABEL: avg_v16i8_const: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rdi), %xmm0 # [4:0.50] +; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0 # [5:0.50] +; SSE2-NEXT: movdqu %xmm0, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v16i8_const: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 # [4:0.50] +; AVX2-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX2-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v16i8_const: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqu %xmm0, (%rax) -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 # [4:0.50] +; AVX512F-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX512F-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v16i8_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 # [4:0.50] +; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <16 x i8>, <16 x i8>* %a %2 = zext <16 x i8> %1 to <16 x i32> %3 = add nuw nsw <16 x i32> %2, @@ -1869,87 +1921,87 @@ define void @avg_v32i8_const(<32 x i8>* %a) { ; SSE2-LABEL: avg_v32i8_const: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm5 -; SSE2-NEXT: movdqa 16(%rdi), %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE2-NEXT: 
punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [5,6,7,8] -; SSE2-NEXT: paddd %xmm9, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,3,4] -; SSE2-NEXT: paddd %xmm3, %xmm7 -; SSE2-NEXT: paddd %xmm9, %xmm6 -; SSE2-NEXT: paddd %xmm3, %xmm4 -; SSE2-NEXT: paddd %xmm9, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm8 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm8 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm3, %xmm7 -; SSE2-NEXT: packuswb %xmm5, %xmm7 -; SSE2-NEXT: pand %xmm3, %xmm6 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: packuswb %xmm6, %xmm4 -; SSE2-NEXT: packuswb %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: packuswb %xmm2, %xmm8 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm8, %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: movdqu %xmm4, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rdi), %xmm5 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 # [4:0.50] +; SSE2-NEXT: pxor %xmm3, %xmm3 # [1:0.33] +; SSE2-NEXT: movdqa %xmm2, %xmm1 # [1:0.33] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7][1:0.50] +; SSE2-NEXT: movdqa %xmm1, %xmm0 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7][1:0.50] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15][1:0.50] +; SSE2-NEXT: movdqa %xmm2, %xmm8 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7][1:0.50] +; SSE2-NEXT: movdqa %xmm5, %xmm6 # [1:0.33] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7][1:0.50] +; SSE2-NEXT: movdqa %xmm6, %xmm4 # [1:0.33] +; 
SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7][1:0.50] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15][1:0.50] +; SSE2-NEXT: movdqa %xmm5, %xmm7 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7][1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [5,6,7,8][4:0.50] +; SSE2-NEXT: paddd %xmm9, %xmm5 # [1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,3,4][4:0.50] +; SSE2-NEXT: paddd %xmm3, %xmm7 # [1:0.50] +; SSE2-NEXT: paddd %xmm9, %xmm6 # [1:0.50] +; SSE2-NEXT: paddd %xmm3, %xmm4 # [1:0.50] +; SSE2-NEXT: paddd %xmm9, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm3, %xmm8 # [1:0.50] +; SSE2-NEXT: paddd %xmm9, %xmm1 # [1:0.50] +; SSE2-NEXT: paddd %xmm3, %xmm0 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm0 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm8 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm4 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm6 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm7 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm5 # [1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0][4:0.50] +; SSE2-NEXT: pand %xmm3, %xmm5 # [1:0.33] +; SSE2-NEXT: pand %xmm3, %xmm7 # [1:0.33] +; SSE2-NEXT: packuswb %xmm5, %xmm7 # [1:0.50] +; SSE2-NEXT: pand %xmm3, %xmm6 # [1:0.33] +; SSE2-NEXT: pand %xmm3, %xmm4 # [1:0.33] +; SSE2-NEXT: packuswb %xmm6, %xmm4 # [1:0.50] +; SSE2-NEXT: packuswb %xmm7, %xmm4 # [1:0.50] +; SSE2-NEXT: pand %xmm3, %xmm2 # [1:0.33] +; SSE2-NEXT: pand %xmm3, %xmm8 # [1:0.33] +; SSE2-NEXT: packuswb %xmm2, %xmm8 # [1:0.50] +; SSE2-NEXT: pand %xmm3, %xmm1 # [1:0.33] +; SSE2-NEXT: pand %xmm3, %xmm0 # [1:0.33] +; SSE2-NEXT: packuswb %xmm1, %xmm0 # [1:0.50] +; SSE2-NEXT: packuswb %xmm8, %xmm0 # [1:0.50] +; SSE2-NEXT: movdqu %xmm0, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm4, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v32i8_const: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 # [4:0.50] +; AVX2-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0 # [5:0.50] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX2-NEXT: vzeroupper # [?:0.000000e+00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v32i8_const: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 # [4:0.50] +; AVX512F-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0 # [5:0.50] +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX512F-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v32i8_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 # [4:0.50] +; AVX512BW-NEXT: vpavgb {{.*}}(%rip), 
%ymm0, %ymm0 # [5:0.50] +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <32 x i8>, <32 x i8>* %a %2 = zext <32 x i8> %1 to <32 x i32> %3 = add nuw nsw <32 x i32> %2, @@ -1962,186 +2014,192 @@ define void @avg_v64i8_const(<64 x i8>* %a) { ; SSE2-LABEL: avg_v64i8_const: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm5 -; SSE2-NEXT: movdqa 16(%rdi), %xmm6 -; SSE2-NEXT: movdqa 32(%rdi), %xmm15 -; SSE2-NEXT: movdqa 48(%rdi), %xmm11 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm10 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm14, %xmm13 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm15, %xmm12 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm6, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, 
%xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [5,6,7,8] -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm15 -; SSE2-NEXT: paddd %xmm0, %xmm14 -; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: movdqa %xmm9, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,2,3,4] -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: movdqa %xmm9, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: packuswb %xmm5, %xmm7 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: packuswb %xmm7, %xmm1 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: packuswb %xmm6, %xmm4 -; SSE2-NEXT: psrld $1, %xmm8 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm8 -; SSE2-NEXT: packuswb %xmm3, %xmm8 -; SSE2-NEXT: packuswb %xmm4, %xmm8 -; SSE2-NEXT: psrld $1, %xmm12 -; SSE2-NEXT: psrld $1, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm12 -; SSE2-NEXT: packuswb %xmm15, %xmm12 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: psrld $1, %xmm14 -; SSE2-NEXT: pand %xmm0, %xmm14 -; SSE2-NEXT: pand %xmm0, %xmm13 -; SSE2-NEXT: packuswb %xmm14, %xmm13 -; SSE2-NEXT: packuswb %xmm12, %xmm13 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm11, %xmm2 -; SSE2-NEXT: psrld $1, %xmm10 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: packuswb %xmm3, %xmm10 -; SSE2-NEXT: packuswb %xmm2, %xmm10 -; SSE2-NEXT: movdqu %xmm10, (%rax) -; SSE2-NEXT: movdqu %xmm13, (%rax) -; SSE2-NEXT: movdqu %xmm8, (%rax) -; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rdi), %xmm5 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rdi), %xmm6 # [4:0.50] +; SSE2-NEXT: movdqa 32(%rdi), %xmm15 # [4:0.50] +; SSE2-NEXT: movdqa 48(%rdi), %xmm11 # [4:0.50] +; SSE2-NEXT: pxor %xmm0, %xmm0 # [1:0.33] +; SSE2-NEXT: movdqa %xmm11, %xmm1 # [1:0.33] +; 
SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm1, %xmm10 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm1, %xmm9 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15][1:0.50] +; SSE2-NEXT: movdqa %xmm11, %xmm1 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3][1:0.50] +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm15, %xmm14 # [1:0.33] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm14, %xmm13 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15][1:0.50] +; SSE2-NEXT: movdqa %xmm15, %xmm12 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm6, %xmm3 # [1:0.33] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm3, %xmm8 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15][1:0.50] +; SSE2-NEXT: movdqa %xmm6, %xmm4 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm5, %xmm2 # [1:0.33] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa %xmm2, %xmm1 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7][1:0.50] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = 
xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15][1:0.50] +; SSE2-NEXT: movdqa %xmm5, %xmm7 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7][1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [5,6,7,8][4:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm5 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm6 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm3 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm15 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm14 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm11 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm9 # [1:0.50] +; SSE2-NEXT: movdqa %xmm9, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,2,3,4][4:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm7 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm1 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm4 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm8 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm12 # [1:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm13 # [1:0.50] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Reload [4:0.50] +; SSE2-NEXT: # [4:0.50] +; SSE2-NEXT: paddd %xmm0, %xmm9 # [1:0.50] +; SSE2-NEXT: movdqa %xmm9, -{{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; SSE2-NEXT: # [1:1.00] +; SSE2-NEXT: paddd %xmm0, %xmm10 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm7 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm5 # [1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0][4:0.50] +; SSE2-NEXT: pand %xmm0, %xmm5 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm7 # [1:0.33] +; SSE2-NEXT: packuswb %xmm5, %xmm7 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm2 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm2 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm1 # [1:0.33] +; SSE2-NEXT: packuswb %xmm2, %xmm1 # [1:0.50] +; SSE2-NEXT: packuswb %xmm7, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm4 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm6 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm6 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm4 # [1:0.33] +; SSE2-NEXT: packuswb %xmm6, %xmm4 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm8 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm3 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm3 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm8 # [1:0.33] +; SSE2-NEXT: packuswb %xmm3, %xmm8 # [1:0.50] +; SSE2-NEXT: packuswb %xmm4, %xmm8 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm12 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm15 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm15 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm12 # [1:0.33] +; SSE2-NEXT: packuswb %xmm15, %xmm12 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm13 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm14 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm14 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm13 # [1:0.33] +; SSE2-NEXT: packuswb %xmm14, %xmm13 # [1:0.50] +; SSE2-NEXT: packuswb %xmm12, %xmm13 # [1:0.50] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload [4:0.50] +; SSE2-NEXT: # [4:0.50] +; SSE2-NEXT: psrld $1, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm11 # [1:0.50] +; SSE2-NEXT: pand %xmm0, %xmm11 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm2 # [1:0.33] +; SSE2-NEXT: packuswb %xmm11, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm10 # [1:0.50] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload [4:0.50] +; SSE2-NEXT: # [4:0.50] +; SSE2-NEXT: psrld $1, %xmm3 # [1:0.50] 
+; SSE2-NEXT: pand %xmm0, %xmm3 # [1:0.33] +; SSE2-NEXT: pand %xmm0, %xmm10 # [1:0.33] +; SSE2-NEXT: packuswb %xmm3, %xmm10 # [1:0.50] +; SSE2-NEXT: packuswb %xmm2, %xmm10 # [1:0.50] +; SSE2-NEXT: movdqu %xmm10, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm13, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm8, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v64i8_const: ; AVX2: # BB#0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [1,2,3,4,5,6,7,8] -; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7 -; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6 -; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 -; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm8 -; AVX2-NEXT: vpsrld $1, %ymm3, %ymm9 -; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 -; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 -; AVX2-NEXT: vpsrld $1, %ymm6, %ymm6 -; AVX2-NEXT: vpsrld $1, %ymm7, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm7 -; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-NEXT: vpshufb 
%ymm2, %ymm4, %ymm4 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 -; AVX2-NEXT: vpshufb %ymm2, %ymm9, %ymm5 -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm6 -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vmovdqu %ymm4, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero[5:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [1,2,3,4,5,6,7,8][4:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm2, %ymm8 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm3, %ymm9 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 # 
[1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm6, %ymm6 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm7, %ymm3 # [1:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31][4:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,2,3][1:1.00] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>[4:0.50] +; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm7 # [1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6 # [1:0.50] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0][1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5 # [1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm4 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 # [1:0.50] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 # [1:1.00] +; AVX2-NEXT: vpshufb %ymm2, %ymm9, %ymm5 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5 # [1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm6 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6 # [1:0.50] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0][1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 # [1:0.50] +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX2-NEXT: vmovdqu %ymm4, (%rax) # [1:1.00] +; AVX2-NEXT: vzeroupper # [?:0.000000e+00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v64i8_const: ; AVX512F: # BB#0: @@ -2160,22 +2218,22 @@ ; AVX512F-NEXT: vpsrld $1, %zmm3, %zmm3 ; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 # [1:1.00] ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, (%rax) -; AVX512F-NEXT: vmovdqu %ymm2, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX512F-NEXT: vmovdqu %ymm2, (%rax) # [1:1.00] +; AVX512F-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v64i8_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 # [4:0.50] ; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <64 x i8>, <64 x i8>* %a %2 = zext <64 x i8> %1 to <64 x i32> %3 = add nuw nsw <64 x i32> %2, @@ -2188,31 
+2246,31 @@ define void @avg_v4i16_const(<4 x i16>* %a) { ; SSE2-LABEL: avg_v4i16_const: ; SSE2: # BB#0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movq %xmm0, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0 # [5:0.50] +; SSE2-NEXT: movq %xmm0, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v4i16_const: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX2-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX2-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v4i16_const: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rax) -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX512F-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX512F-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v4i16_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rax) -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero[4:0.50] +; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX512BW-NEXT: vmovq %xmm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <4 x i16>, <4 x i16>* %a %2 = zext <4 x i16> %1 to <4 x i32> %3 = add nuw nsw <4 x i32> %2, @@ -2225,31 +2283,31 @@ define void @avg_v8i16_const(<8 x i16>* %a) { ; SSE2-LABEL: avg_v8i16_const: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rdi), %xmm0 # [4:0.50] +; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0 # [5:0.50] +; SSE2-NEXT: movdqu %xmm0, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v8i16_const: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 # [4:0.50] +; AVX2-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX2-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v8i16_const: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqu %xmm0, (%rax) -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 # [4:0.50] +; AVX512F-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX512F-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v8i16_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 # [4:0.50] +; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <8 x i16>, <8 x i16>* %a %2 = zext <8 x i16> %1 to <8 x i32> %3 = add nuw nsw <8 x i32> %2, @@ -2262,62 +2320,62 @@ define void @avg_v16i16_const(<16 x i16>* %a) { ; SSE2-LABEL: 
avg_v16i16_const: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm3 -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,6,7,8] -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2,3,4] -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm0, %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rdi), %xmm3 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 # [4:0.50] +; SSE2-NEXT: pxor %xmm4, %xmm4 # [1:0.33] +; SSE2-NEXT: movdqa %xmm0, %xmm1 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7][1:0.50] +; SSE2-NEXT: movdqa %xmm3, %xmm2 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7][1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,6,7,8][4:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm3 # [1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2,3,4][4:0.50] +; SSE2-NEXT: paddd %xmm5, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm4, %xmm0 # [1:0.50] +; SSE2-NEXT: paddd %xmm5, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm0 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm3 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm3 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm3 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm2 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm2 # [1:0.50] +; SSE2-NEXT: packssdw %xmm3, %xmm2 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm0 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm0 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm1 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm1 # [1:0.50] +; SSE2-NEXT: packssdw %xmm0, %xmm1 # [1:0.50] +; SSE2-NEXT: movdqu %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm2, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v16i16_const: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 # [4:0.50] +; AVX2-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0 # [5:0.50] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX2-NEXT: vzeroupper # [?:0.000000e+00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v16i16_const: ; AVX512F: # BB#0: -; 
AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 # [4:0.50] +; AVX512F-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0 # [5:0.50] +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX512F-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v16i16_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 # [4:0.50] +; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0 # [5:0.50] +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX512BW-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <16 x i16>, <16 x i16>* %a %2 = zext <16 x i16> %1 to <16 x i32> %3 = add nuw nsw <16 x i32> %2, @@ -2330,97 +2388,97 @@ define void @avg_v32i16_const(<32 x i16>* %a) { ; SSE2-LABEL: avg_v32i16_const: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm7 -; SSE2-NEXT: movdqa 16(%rdi), %xmm6 -; SSE2-NEXT: movdqa 32(%rdi), %xmm4 -; SSE2-NEXT: movdqa 48(%rdi), %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE2-NEXT: movdqa %xmm6, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE2-NEXT: movdqa %xmm7, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [5,6,7,8] -; SSE2-NEXT: paddd %xmm8, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,2,3,4] -; SSE2-NEXT: paddd %xmm9, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm6 -; SSE2-NEXT: paddd %xmm9, %xmm3 -; SSE2-NEXT: paddd %xmm8, %xmm4 -; SSE2-NEXT: paddd %xmm9, %xmm2 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: pslld $16, %xmm7 -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: packssdw %xmm7, %xmm5 -; SSE2-NEXT: pslld $16, %xmm6 -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: packssdw %xmm6, %xmm3 -; SSE2-NEXT: pslld $16, %xmm4 -; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm4, %xmm2 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm0, %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rax) 
-; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm3, (%rax) -; SSE2-NEXT: movdqu %xmm5, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa (%rdi), %xmm7 # [4:0.50] +; SSE2-NEXT: movdqa 16(%rdi), %xmm6 # [4:0.50] +; SSE2-NEXT: movdqa 32(%rdi), %xmm4 # [4:0.50] +; SSE2-NEXT: movdqa 48(%rdi), %xmm0 # [4:0.50] +; SSE2-NEXT: pxor %xmm8, %xmm8 # [1:0.33] +; SSE2-NEXT: movdqa %xmm0, %xmm1 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7][1:0.50] +; SSE2-NEXT: movdqa %xmm4, %xmm2 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7][1:0.50] +; SSE2-NEXT: movdqa %xmm6, %xmm3 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7][1:0.50] +; SSE2-NEXT: movdqa %xmm7, %xmm5 # [1:0.33] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3][1:0.50] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7][1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [5,6,7,8][4:0.50] +; SSE2-NEXT: paddd %xmm8, %xmm7 # [1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,2,3,4][4:0.50] +; SSE2-NEXT: paddd %xmm9, %xmm5 # [1:0.50] +; SSE2-NEXT: paddd %xmm8, %xmm6 # [1:0.50] +; SSE2-NEXT: paddd %xmm9, %xmm3 # [1:0.50] +; SSE2-NEXT: paddd %xmm8, %xmm4 # [1:0.50] +; SSE2-NEXT: paddd %xmm9, %xmm2 # [1:0.50] +; SSE2-NEXT: paddd %xmm8, %xmm0 # [1:0.50] +; SSE2-NEXT: paddd %xmm9, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm1 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm0 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm2 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm4 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm3 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm6 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm5 # [1:0.50] +; SSE2-NEXT: psrld $1, %xmm7 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm7 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm7 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm5 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm5 # [1:0.50] +; SSE2-NEXT: packssdw %xmm7, %xmm5 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm6 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm6 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm3 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm3 # [1:0.50] +; SSE2-NEXT: packssdw %xmm6, %xmm3 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm4 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm4 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm2 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm2 # [1:0.50] +; SSE2-NEXT: packssdw %xmm4, %xmm2 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm0 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm0 # [1:0.50] +; SSE2-NEXT: pslld $16, %xmm1 # [1:0.50] +; SSE2-NEXT: psrad $16, %xmm1 # [1:0.50] +; SSE2-NEXT: packssdw %xmm0, %xmm1 # [1:0.50] +; SSE2-NEXT: movdqu %xmm1, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm2, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm3, (%rax) # [1:1.00] +; SSE2-NEXT: movdqu %xmm5, (%rax) # [1:1.00] +; SSE2-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: avg_v32i16_const: ; AVX2: # BB#0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,3,4,5,6,7,8] -; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vmovdqu %ymm2, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero[5:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,3,4,5,6,7,8][4:0.50] +; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 # [1:0.50] +; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31][4:0.50] +; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3][1:1.00] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 # [1:1.00] +; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3][1:1.00] +; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3][1:1.00] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) # [1:1.00] +; AVX2-NEXT: vmovdqu %ymm2, (%rax) # [1:1.00] +; AVX2-NEXT: vzeroupper # [?:0.000000e+00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512F-LABEL: avg_v32i16_const: ; AVX512F: # BB#0: @@ -2433,16 +2491,16 @@ ; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1 ; AVX512F-NEXT: vpmovdw %zmm1, (%rax) ; AVX512F-NEXT: 
vpmovdw %zmm0, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512F-NEXT: retq # [5:1.00] ; ; AVX512BW-LABEL: avg_v32i16_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 # [4:0.50] ; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-NEXT: vzeroupper # [?:0.000000e+00] +; AVX512BW-NEXT: retq # [5:1.00] %1 = load <32 x i16>, <32 x i16>* %a %2 = zext <32 x i16> %1 to <32 x i32> %3 = add nuw nsw <32 x i32> %2, Index: test/CodeGen/X86/avx-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/avx-intrinsics-x86.ll +++ test/CodeGen/X86/avx-intrinsics-x86.ll @@ -3,10 +3,15 @@ ; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL define <4 x double> @test_x86_avx_addsub_pd_256(<4 x double> %a0, <4 x double> %a1) { -; CHECK-LABEL: test_x86_avx_addsub_pd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd0,0xc1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_addsub_pd_256: +; AVX: ## BB#0: +; AVX-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd0,0xc1] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_addsub_pd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd0,0xc1][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -14,10 +19,15 @@ define <8 x float> @test_x86_avx_addsub_ps_256(<8 x float> %a0, <8 x float> %a1) { -; CHECK-LABEL: test_x86_avx_addsub_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xff,0xd0,0xc1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_addsub_ps_256: +; AVX: ## BB#0: +; AVX-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xff,0xd0,0xc1] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_addsub_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xff,0xd0,0xc1][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -25,10 +35,15 @@ define <4 x double> @test_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { -; CHECK-LABEL: test_x86_avx_blendv_pd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4b,0xc1,0x20] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_blendv_pd_256: +; AVX: ## BB#0: +; AVX-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4b,0xc1,0x20] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_blendv_pd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4b,0xc1,0x20][2:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -36,10 +51,15 @@ define <8 x float> 
@test_x86_avx_blendv_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { -; CHECK-LABEL: test_x86_avx_blendv_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4a,0xc1,0x20] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_blendv_ps_256: +; AVX: ## BB#0: +; AVX-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4a,0xc1,0x20] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_blendv_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4a,0xc1,0x20][2:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -47,10 +67,15 @@ define <4 x double> @test_x86_avx_cmp_pd_256(<4 x double> %a0, <4 x double> %a1) { -; CHECK-LABEL: test_x86_avx_cmp_pd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vcmpordpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xc2,0xc1,0x07] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_cmp_pd_256: +; AVX: ## BB#0: +; AVX-NEXT: vcmpordpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xc2,0xc1,0x07] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_cmp_pd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vcmpordpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xc2,0xc1,0x07][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -58,50 +83,91 @@ define <8 x float> @test_x86_avx_cmp_ps_256(<8 x float> %a0, <8 x float> %a1) { -; CHECK-LABEL: test_x86_avx_cmp_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vcmpordps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0xc2,0xc1,0x07] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_cmp_ps_256: +; AVX: ## BB#0: +; AVX-NEXT: vcmpordps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0xc2,0xc1,0x07] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_cmp_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vcmpordps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0xc2,0xc1,0x07][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1] ret <8 x float> %res } define <8 x float> @test_x86_avx_cmp_ps_256_pseudo_op(<8 x float> %a0, <8 x float> %a1) { -; CHECK-LABEL: test_x86_avx_cmp_ps_256_pseudo_op: -; CHECK: ## BB#0: -; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x00] -; CHECK-NEXT: vcmpltps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x01] -; CHECK-NEXT: vcmpleps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x02] -; CHECK-NEXT: vcmpunordps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x03] -; CHECK-NEXT: vcmpneqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x04] -; CHECK-NEXT: vcmpnltps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x05] -; CHECK-NEXT: vcmpnleps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x06] -; CHECK-NEXT: vcmpordps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x07] -; CHECK-NEXT: vcmpeq_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x08] -; CHECK-NEXT: vcmpngeps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x09] -; CHECK-NEXT: vcmpngtps %ymm1, %ymm0, %ymm1 ## 
encoding: [0xc5,0xfc,0xc2,0xc9,0x0a] -; CHECK-NEXT: vcmpfalseps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0b] -; CHECK-NEXT: vcmpneq_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0c] -; CHECK-NEXT: vcmpgeps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0d] -; CHECK-NEXT: vcmpgtps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0e] -; CHECK-NEXT: vcmptrueps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0f] -; CHECK-NEXT: vcmpeq_osps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x10] -; CHECK-NEXT: vcmplt_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x11] -; CHECK-NEXT: vcmple_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x12] -; CHECK-NEXT: vcmpunord_sps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x13] -; CHECK-NEXT: vcmpneq_usps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x14] -; CHECK-NEXT: vcmpnlt_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x15] -; CHECK-NEXT: vcmpnle_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x16] -; CHECK-NEXT: vcmpord_sps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x17] -; CHECK-NEXT: vcmpeq_usps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x18] -; CHECK-NEXT: vcmpnge_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x19] -; CHECK-NEXT: vcmpngt_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1a] -; CHECK-NEXT: vcmpfalse_osps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1b] -; CHECK-NEXT: vcmpneq_osps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1c] -; CHECK-NEXT: vcmpge_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1d] -; CHECK-NEXT: vcmpgt_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1e] -; CHECK-NEXT: vcmptrue_usps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0xc2,0xc1,0x1f] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_cmp_ps_256_pseudo_op: +; AVX: ## BB#0: +; AVX-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x00] +; AVX-NEXT: vcmpltps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x01] +; AVX-NEXT: vcmpleps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x02] +; AVX-NEXT: vcmpunordps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x03] +; AVX-NEXT: vcmpneqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x04] +; AVX-NEXT: vcmpnltps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x05] +; AVX-NEXT: vcmpnleps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x06] +; AVX-NEXT: vcmpordps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x07] +; AVX-NEXT: vcmpeq_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x08] +; AVX-NEXT: vcmpngeps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x09] +; AVX-NEXT: vcmpngtps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0a] +; AVX-NEXT: vcmpfalseps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0b] +; AVX-NEXT: vcmpneq_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0c] +; AVX-NEXT: vcmpgeps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0d] +; AVX-NEXT: vcmpgtps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0e] +; AVX-NEXT: vcmptrueps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0f] +; AVX-NEXT: vcmpeq_osps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x10] +; AVX-NEXT: vcmplt_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x11] +; AVX-NEXT: vcmple_oqps %ymm1, %ymm0, %ymm1 ## encoding: 
[0xc5,0xfc,0xc2,0xc9,0x12] +; AVX-NEXT: vcmpunord_sps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x13] +; AVX-NEXT: vcmpneq_usps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x14] +; AVX-NEXT: vcmpnlt_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x15] +; AVX-NEXT: vcmpnle_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x16] +; AVX-NEXT: vcmpord_sps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x17] +; AVX-NEXT: vcmpeq_usps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x18] +; AVX-NEXT: vcmpnge_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x19] +; AVX-NEXT: vcmpngt_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1a] +; AVX-NEXT: vcmpfalse_osps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1b] +; AVX-NEXT: vcmpneq_osps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1c] +; AVX-NEXT: vcmpge_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1d] +; AVX-NEXT: vcmpgt_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1e] +; AVX-NEXT: vcmptrue_usps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0xc2,0xc1,0x1f] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_cmp_ps_256_pseudo_op: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x00][3:1.00] +; AVX512VL-NEXT: vcmpltps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x01][3:1.00] +; AVX512VL-NEXT: vcmpleps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x02][3:1.00] +; AVX512VL-NEXT: vcmpunordps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x03][3:1.00] +; AVX512VL-NEXT: vcmpneqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x04][3:1.00] +; AVX512VL-NEXT: vcmpnltps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x05][3:1.00] +; AVX512VL-NEXT: vcmpnleps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x06][3:1.00] +; AVX512VL-NEXT: vcmpordps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x07][3:1.00] +; AVX512VL-NEXT: vcmpeq_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x08][3:1.00] +; AVX512VL-NEXT: vcmpngeps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x09][3:1.00] +; AVX512VL-NEXT: vcmpngtps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0a][3:1.00] +; AVX512VL-NEXT: vcmpfalseps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0b][3:1.00] +; AVX512VL-NEXT: vcmpneq_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0c][3:1.00] +; AVX512VL-NEXT: vcmpgeps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0d][3:1.00] +; AVX512VL-NEXT: vcmpgtps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0e][3:1.00] +; AVX512VL-NEXT: vcmptrueps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0f][3:1.00] +; AVX512VL-NEXT: vcmpeq_osps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x10][3:1.00] +; AVX512VL-NEXT: vcmplt_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x11][3:1.00] +; AVX512VL-NEXT: vcmple_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x12][3:1.00] +; AVX512VL-NEXT: vcmpunord_sps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x13][3:1.00] +; AVX512VL-NEXT: vcmpneq_usps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x14][3:1.00] +; AVX512VL-NEXT: vcmpnlt_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x15][3:1.00] +; AVX512VL-NEXT: vcmpnle_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x16][3:1.00] +; AVX512VL-NEXT: vcmpord_sps %ymm1, %ymm0, %ymm1 ## 
encoding: [0xc5,0xfc,0xc2,0xc9,0x17][3:1.00] +; AVX512VL-NEXT: vcmpeq_usps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x18][3:1.00] +; AVX512VL-NEXT: vcmpnge_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x19][3:1.00] +; AVX512VL-NEXT: vcmpngt_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1a][3:1.00] +; AVX512VL-NEXT: vcmpfalse_osps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1b][3:1.00] +; AVX512VL-NEXT: vcmpneq_osps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1c][3:1.00] +; AVX512VL-NEXT: vcmpge_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1d][3:1.00] +; AVX512VL-NEXT: vcmpgt_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1e][3:1.00] +; AVX512VL-NEXT: vcmptrue_usps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0xc2,0xc1,0x1f][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %a2 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 0) ; <<8 x float>> [#uses=1] %a3 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a2, i8 1) ; <<8 x float>> [#uses=1] %a4 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a3, i8 2) ; <<8 x float>> [#uses=1] @@ -148,9 +214,9 @@ ; ; AVX512VL-LABEL: test_x86_avx_cvt_pd2_ps_256: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vcvtpd2ps %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xc0] -; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vcvtpd2ps %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xc0][5:1.00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -166,9 +232,9 @@ ; ; AVX512VL-LABEL: test_x86_avx_cvt_pd2dq_256: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vcvtpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xc0] -; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vcvtpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xc0][6:1.00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -176,10 +242,15 @@ define <8 x i32> @test_x86_avx_cvt_ps2dq_256(<8 x float> %a0) { -; CHECK-LABEL: test_x86_avx_cvt_ps2dq_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x5b,0xc0] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_cvt_ps2dq_256: +; AVX: ## BB#0: +; AVX-NEXT: vcvtps2dq %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x5b,0xc0] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_cvt_ps2dq_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vcvtps2dq %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x5b,0xc0][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -194,8 +265,8 @@ ; ; AVX512VL-LABEL: test_x86_avx_cvtdq2_ps_256: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vcvtdq2ps %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5b,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vcvtdq2ps %ymm0, %ymm0 ## EVEX TO VEX Compression 
encoding: [0xc5,0xfc,0x5b,0xc0][4:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32> %a0) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -211,9 +282,9 @@ ; ; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vcvttpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xc0] -; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vcvttpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xc0][6:1.00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -228,8 +299,8 @@ ; ; AVX512VL-LABEL: test_x86_avx_cvtt_ps2dq_256: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xc0][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -237,10 +308,15 @@ define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) { -; CHECK-LABEL: test_x86_avx_dp_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x40,0xc1,0x07] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_dp_ps_256: +; AVX: ## BB#0: +; AVX-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x40,0xc1,0x07] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_dp_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x40,0xc1,0x07][14:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -248,10 +324,15 @@ define <4 x double> @test_x86_avx_hadd_pd_256(<4 x double> %a0, <4 x double> %a1) { -; CHECK-LABEL: test_x86_avx_hadd_pd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x7c,0xc1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_hadd_pd_256: +; AVX: ## BB#0: +; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x7c,0xc1] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_hadd_pd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x7c,0xc1][5:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -259,10 +340,15 @@ define <8 x float> @test_x86_avx_hadd_ps_256(<8 x float> %a0, <8 x float> %a1) { -; CHECK-LABEL: test_x86_avx_hadd_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xff,0x7c,0xc1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_hadd_ps_256: +; AVX: ## BB#0: +; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xff,0x7c,0xc1] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_hadd_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ## 
encoding: [0xc5,0xff,0x7c,0xc1][5:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -270,10 +356,15 @@ define <4 x double> @test_x86_avx_hsub_pd_256(<4 x double> %a0, <4 x double> %a1) { -; CHECK-LABEL: test_x86_avx_hsub_pd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x7d,0xc1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_hsub_pd_256: +; AVX: ## BB#0: +; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x7d,0xc1] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_hsub_pd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x7d,0xc1][5:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -281,10 +372,15 @@ define <8 x float> @test_x86_avx_hsub_ps_256(<8 x float> %a0, <8 x float> %a1) { -; CHECK-LABEL: test_x86_avx_hsub_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vhsubps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xff,0x7d,0xc1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_hsub_ps_256: +; AVX: ## BB#0: +; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xff,0x7d,0xc1] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_hsub_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vhsubps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xff,0x7d,0xc1][5:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -292,11 +388,17 @@ define <32 x i8> @test_x86_avx_ldu_dq_256(i8* %a0) { -; CHECK-LABEL: test_x86_avx_ldu_dq_256: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vlddqu (%eax), %ymm0 ## encoding: [0xc5,0xff,0xf0,0x00] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_ldu_dq_256: +; AVX: ## BB#0: +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX-NEXT: vlddqu (%eax), %ymm0 ## encoding: [0xc5,0xff,0xf0,0x00] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_ldu_dq_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vlddqu (%eax), %ymm0 ## encoding: [0xc5,0xff,0xf0,0x00][4:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %a0) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -304,11 +406,17 @@ define <2 x double> @test_x86_avx_maskload_pd(i8* %a0, <2 x i64> %mask) { -; CHECK-LABEL: test_x86_avx_maskload_pd: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x2d,0x00] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_maskload_pd: +; AVX: ## BB#0: +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x2d,0x00] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_maskload_pd: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: 
[0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x2d,0x00][4:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %mask) ; <<2 x double>> [#uses=1] ret <2 x double> %res } @@ -316,11 +424,17 @@ define <4 x double> @test_x86_avx_maskload_pd_256(i8* %a0, <4 x i64> %mask) { -; CHECK-LABEL: test_x86_avx_maskload_pd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2d,0x00] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_maskload_pd_256: +; AVX: ## BB#0: +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2d,0x00] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_maskload_pd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2d,0x00][4:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %mask) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -328,11 +442,17 @@ define <4 x float> @test_x86_avx_maskload_ps(i8* %a0, <4 x i32> %mask) { -; CHECK-LABEL: test_x86_avx_maskload_ps: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vmaskmovps (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x2c,0x00] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_maskload_ps: +; AVX: ## BB#0: +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX-NEXT: vmaskmovps (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x2c,0x00] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_maskload_ps: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vmaskmovps (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x2c,0x00][4:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %mask) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -340,11 +460,17 @@ define <8 x float> @test_x86_avx_maskload_ps_256(i8* %a0, <8 x i32> %mask) { -; CHECK-LABEL: test_x86_avx_maskload_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vmaskmovps (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2c,0x00] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_maskload_ps_256: +; AVX: ## BB#0: +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX-NEXT: vmaskmovps (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2c,0x00] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_maskload_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vmaskmovps (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2c,0x00][4:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %mask) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -352,11 +478,17 @@ define void 
@test_x86_avx_maskstore_pd(i8* %a0, <2 x i64> %mask, <2 x double> %a2) { -; CHECK-LABEL: test_x86_avx_maskstore_pd: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0x79,0x2f,0x08] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_maskstore_pd: +; AVX: ## BB#0: +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0x79,0x2f,0x08] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_maskstore_pd: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0x79,0x2f,0x08][13:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %mask, <2 x double> %a2) ret void } @@ -364,12 +496,19 @@ define void @test_x86_avx_maskstore_pd_256(i8* %a0, <4 x i64> %mask, <4 x double> %a2) { -; CHECK-LABEL: test_x86_avx_maskstore_pd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2f,0x08] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_maskstore_pd_256: +; AVX: ## BB#0: +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2f,0x08] +; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_maskstore_pd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2f,0x08][14:1.00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %mask, <4 x double> %a2) ret void } @@ -377,11 +516,17 @@ define void @test_x86_avx_maskstore_ps(i8* %a0, <4 x i32> %mask, <4 x float> %a2) { -; CHECK-LABEL: test_x86_avx_maskstore_ps: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vmaskmovps %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0x79,0x2e,0x08] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_maskstore_ps: +; AVX: ## BB#0: +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX-NEXT: vmaskmovps %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0x79,0x2e,0x08] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_maskstore_ps: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vmaskmovps %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0x79,0x2e,0x08][13:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %mask, <4 x float> %a2) ret void } @@ -389,12 +534,19 @@ define void @test_x86_avx_maskstore_ps_256(i8* %a0, <8 x i32> %mask, <8 x float> %a2) { -; CHECK-LABEL: test_x86_avx_maskstore_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vmaskmovps %ymm1, %ymm0, 
(%eax) ## encoding: [0xc4,0xe2,0x7d,0x2e,0x08] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_maskstore_ps_256: +; AVX: ## BB#0: +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2e,0x08] +; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_maskstore_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2e,0x08][14:1.00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %mask, <8 x float> %a2) ret void } @@ -409,8 +561,8 @@ ; ; AVX512VL-LABEL: test_x86_avx_max_pd_256: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5f,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5f,0xc1][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -425,8 +577,8 @@ ; ; AVX512VL-LABEL: test_x86_avx_max_ps_256: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5f,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5f,0xc1][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -441,8 +593,8 @@ ; ; AVX512VL-LABEL: test_x86_avx_min_pd_256: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vminpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5d,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vminpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5d,0xc1][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -457,8 +609,8 @@ ; ; AVX512VL-LABEL: test_x86_avx_min_ps_256: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vminps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5d,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vminps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5d,0xc1][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -466,11 +618,17 @@ define i32 @test_x86_avx_movmsk_pd_256(<4 x double> %a0) { -; CHECK-LABEL: test_x86_avx_movmsk_pd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vmovmskpd %ymm0, %eax ## encoding: [0xc5,0xfd,0x50,0xc0] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_movmsk_pd_256: +; AVX: ## BB#0: +; AVX-NEXT: vmovmskpd %ymm0, %eax ## encoding: [0xc5,0xfd,0x50,0xc0] +; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; 
AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_movmsk_pd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vmovmskpd %ymm0, %eax ## encoding: [0xc5,0xfd,0x50,0xc0][2:1.00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) ; [#uses=1] ret i32 %res } @@ -478,11 +636,17 @@ define i32 @test_x86_avx_movmsk_ps_256(<8 x float> %a0) { -; CHECK-LABEL: test_x86_avx_movmsk_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vmovmskps %ymm0, %eax ## encoding: [0xc5,0xfc,0x50,0xc0] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_movmsk_ps_256: +; AVX: ## BB#0: +; AVX-NEXT: vmovmskps %ymm0, %eax ## encoding: [0xc5,0xfc,0x50,0xc0] +; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_movmsk_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vmovmskps %ymm0, %eax ## encoding: [0xc5,0xfc,0x50,0xc0][2:1.00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) ; [#uses=1] ret i32 %res } @@ -495,13 +659,21 @@ define i32 @test_x86_avx_ptestc_256(<4 x i64> %a0, <4 x i64> %a1) { -; CHECK-LABEL: test_x86_avx_ptestc_256: -; CHECK: ## BB#0: -; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; CHECK-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1] -; CHECK-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_ptestc_256: +; AVX: ## BB#0: +; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1] +; AVX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0] +; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_ptestc_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; AVX512VL-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1][2:1.00] +; AVX512VL-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0][1:0.50] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1) ; [#uses=1] ret i32 %res } @@ -509,13 +681,21 @@ define i32 @test_x86_avx_ptestnzc_256(<4 x i64> %a0, <4 x i64> %a1) { -; CHECK-LABEL: test_x86_avx_ptestnzc_256: -; CHECK: ## BB#0: -; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; CHECK-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1] -; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_ptestnzc_256: +; AVX: ## BB#0: +; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1] +; AVX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_ptestnzc_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; AVX512VL-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1][2:1.00] +; 
AVX512VL-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0][1:0.50] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1) ; [#uses=1] ret i32 %res } @@ -523,13 +703,21 @@ define i32 @test_x86_avx_ptestz_256(<4 x i64> %a0, <4 x i64> %a1) { -; CHECK-LABEL: test_x86_avx_ptestz_256: -; CHECK: ## BB#0: -; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; CHECK-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1] -; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_ptestz_256: +; AVX: ## BB#0: +; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1] +; AVX-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] +; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_ptestz_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; AVX512VL-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1][2:1.00] +; AVX512VL-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0][1:0.50] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1) ; [#uses=1] ret i32 %res } @@ -545,7 +733,7 @@ ; AVX512VL-LABEL: test_x86_avx_rcp_ps_256: ; AVX512VL: ## BB#0: ; AVX512VL-NEXT: vrcp14ps %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x4c,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -553,10 +741,15 @@ define <4 x double> @test_x86_avx_round_pd_256(<4 x double> %a0) { -; CHECK-LABEL: test_x86_avx_round_pd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vroundpd $7, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x09,0xc0,0x07] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_round_pd_256: +; AVX: ## BB#0: +; AVX-NEXT: vroundpd $7, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x09,0xc0,0x07] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_round_pd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vroundpd $7, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x09,0xc0,0x07][6:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -564,10 +757,15 @@ define <8 x float> @test_x86_avx_round_ps_256(<8 x float> %a0) { -; CHECK-LABEL: test_x86_avx_round_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vroundps $7, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x08,0xc0,0x07] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_round_ps_256: +; AVX: ## BB#0: +; AVX-NEXT: vroundps $7, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x08,0xc0,0x07] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_round_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vroundps $7, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x08,0xc0,0x07][6:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -583,7 +781,7 @@ ; 
AVX512VL-LABEL: test_x86_avx_rsqrt_ps_256: ; AVX512VL: ## BB#0: ; AVX512VL-NEXT: vrsqrt14ps %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x4e,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -591,10 +789,15 @@ define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) { -; CHECK-LABEL: test_x86_avx_sqrt_pd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x51,0xc0] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_sqrt_pd_256: +; AVX: ## BB#0: +; AVX-NEXT: vsqrtpd %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x51,0xc0] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vsqrtpd %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x51,0xc0][28:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -602,10 +805,15 @@ define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) { -; CHECK-LABEL: test_x86_avx_sqrt_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vsqrtps %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0x51,0xc0] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_sqrt_ps_256: +; AVX: ## BB#0: +; AVX-NEXT: vsqrtps %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0x51,0xc0] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vsqrtps %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0x51,0xc0][19:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -613,11 +821,15 @@ define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) { -; CHECK-LABEL: test_x86_avx_vperm2f128_pd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vperm2f128 $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x07] -; CHECK-NEXT: ## ymm0 = ymm1[2,3],ymm0[0,1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vperm2f128_pd_256: +; AVX: ## BB#0: +; AVX-NEXT: vperm2f128 $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x07]ymm0 = ymm1[2,3],ymm0[0,1] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vperm2f128_pd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vperm2f128 $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x07]ymm0 = ymm1[2,3],ymm0[0,1][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -625,11 +837,15 @@ define <8 x float> @test_x86_avx_vperm2f128_ps_256(<8 x float> %a0, <8 x float> %a1) { -; CHECK-LABEL: test_x86_avx_vperm2f128_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vperm2f128 $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x07] -; CHECK-NEXT: ## ymm0 = ymm1[2,3],ymm0[0,1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vperm2f128_ps_256: +; AVX: ## BB#0: +; AVX-NEXT: vperm2f128 $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x07]ymm0 = ymm1[2,3],ymm0[0,1] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vperm2f128_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vperm2f128 $7, %ymm1, %ymm0, %ymm0 ## 
encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x07]ymm0 = ymm1[2,3],ymm0[0,1][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -637,11 +853,15 @@ define <8 x i32> @test_x86_avx_vperm2f128_si_256(<8 x i32> %a0, <8 x i32> %a1) { -; CHECK-LABEL: test_x86_avx_vperm2f128_si_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vperm2f128 $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x07] -; CHECK-NEXT: ## ymm0 = ymm1[2,3],ymm0[0,1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vperm2f128_si_256: +; AVX: ## BB#0: +; AVX-NEXT: vperm2f128 $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x07]ymm0 = ymm1[2,3],ymm0[0,1] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vperm2f128_si_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vperm2f128 $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x07]ymm0 = ymm1[2,3],ymm0[0,1][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -656,8 +876,8 @@ ; ; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0d,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0d,0xc1][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) ; <<2 x double>> [#uses=1] ret <2 x double> %res } @@ -672,8 +892,8 @@ ; ; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0d,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0d,0xc1][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -682,15 +902,13 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) { ; AVX-LABEL: test_x86_avx_vpermilvar_pd_256_2: ; AVX: ## BB#0: -; AVX-NEXT: vpermilpd $9, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x09] -; AVX-NEXT: ## ymm0 = ymm0[1,0,2,3] +; AVX-NEXT: vpermilpd $9, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x09]ymm0 = ymm0[1,0,2,3] ; AVX-NEXT: retl ## encoding: [0xc3] ; ; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256_2: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpermilpd $9, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x09] -; AVX512VL-NEXT: ## ymm0 = ymm0[1,0,2,3] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpermilpd $9, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x09]ymm0 = ymm0[1,0,2,3][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> ) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -703,8 +921,8 @@ ; ; AVX512VL-LABEL: test_x86_avx_vpermilvar_ps: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x79,0x0c,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0xc1][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -717,9 +935,9 @@ ; ; AVX512VL-LABEL: test_x86_avx_vpermilvar_ps_load: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX512VL-NEXT: vpermilps (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0x00] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpermilps (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0x00][5:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %a2 = load <4 x i32>, <4 x i32>* %a1 %res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a2) ; <<4 x float>> [#uses=1] ret <4 x float> %res @@ -735,8 +953,8 @@ ; ; AVX512VL-LABEL: test_x86_avx_vpermilvar_ps_256: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpermilps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0c,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpermilps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0c,0xc1][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -744,12 +962,19 @@ define i32 @test_x86_avx_vtestc_pd(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx_vtestc_pd: -; CHECK: ## BB#0: -; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; CHECK-NEXT: vtestpd %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0f,0xc1] -; CHECK-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vtestc_pd: +; AVX: ## BB#0: +; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX-NEXT: vtestpd %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0f,0xc1] +; AVX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vtestc_pd: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; AVX512VL-NEXT: vtestpd %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0f,0xc1][1:0.33] +; AVX512VL-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) ; [#uses=1] ret i32 %res } @@ -757,13 +982,21 @@ define i32 @test_x86_avx_vtestc_pd_256(<4 x double> %a0, <4 x double> %a1) { -; CHECK-LABEL: test_x86_avx_vtestc_pd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; CHECK-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1] -; CHECK-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vtestc_pd_256: +; AVX: ## BB#0: +; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1] +; AVX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0] +; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX-NEXT: retl ## encoding: 
[0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vtestc_pd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; AVX512VL-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1][1:0.33] +; AVX512VL-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0][1:0.50] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) ; [#uses=1] ret i32 %res } @@ -771,12 +1004,19 @@ define i32 @test_x86_avx_vtestc_ps(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: test_x86_avx_vtestc_ps: -; CHECK: ## BB#0: -; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; CHECK-NEXT: vtestps %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0e,0xc1] -; CHECK-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vtestc_ps: +; AVX: ## BB#0: +; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX-NEXT: vtestps %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0e,0xc1] +; AVX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vtestc_ps: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; AVX512VL-NEXT: vtestps %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0e,0xc1][1:0.33] +; AVX512VL-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) ; [#uses=1] ret i32 %res } @@ -784,13 +1024,21 @@ define i32 @test_x86_avx_vtestc_ps_256(<8 x float> %a0, <8 x float> %a1) { -; CHECK-LABEL: test_x86_avx_vtestc_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; CHECK-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1] -; CHECK-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vtestc_ps_256: +; AVX: ## BB#0: +; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1] +; AVX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0] +; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vtestc_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; AVX512VL-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1][1:0.33] +; AVX512VL-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0][1:0.50] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) ; [#uses=1] ret i32 %res } @@ -798,12 +1046,19 @@ define i32 @test_x86_avx_vtestnzc_pd(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx_vtestnzc_pd: -; CHECK: ## BB#0: -; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; CHECK-NEXT: vtestpd %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0f,0xc1] -; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vtestnzc_pd: +; AVX: ## BB#0: +; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX-NEXT: vtestpd %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0f,0xc1] +; AVX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX-NEXT: retl ## encoding: 
[0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vtestnzc_pd: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; AVX512VL-NEXT: vtestpd %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0f,0xc1][1:0.33] +; AVX512VL-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1) ; [#uses=1] ret i32 %res } @@ -811,13 +1066,21 @@ define i32 @test_x86_avx_vtestnzc_pd_256(<4 x double> %a0, <4 x double> %a1) { -; CHECK-LABEL: test_x86_avx_vtestnzc_pd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; CHECK-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1] -; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vtestnzc_pd_256: +; AVX: ## BB#0: +; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1] +; AVX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vtestnzc_pd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; AVX512VL-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1][1:0.33] +; AVX512VL-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0][1:0.50] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1) ; [#uses=1] ret i32 %res } @@ -825,12 +1088,19 @@ define i32 @test_x86_avx_vtestnzc_ps(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: test_x86_avx_vtestnzc_ps: -; CHECK: ## BB#0: -; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; CHECK-NEXT: vtestps %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0e,0xc1] -; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vtestnzc_ps: +; AVX: ## BB#0: +; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX-NEXT: vtestps %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0e,0xc1] +; AVX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vtestnzc_ps: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; AVX512VL-NEXT: vtestps %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0e,0xc1][1:0.33] +; AVX512VL-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1) ; [#uses=1] ret i32 %res } @@ -838,13 +1108,21 @@ define i32 @test_x86_avx_vtestnzc_ps_256(<8 x float> %a0, <8 x float> %a1) { -; CHECK-LABEL: test_x86_avx_vtestnzc_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; CHECK-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1] -; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vtestnzc_ps_256: +; AVX: ## BB#0: +; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1] +; AVX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; 
AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vtestnzc_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; AVX512VL-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1][1:0.33] +; AVX512VL-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0][1:0.50] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1) ; [#uses=1] ret i32 %res } @@ -852,12 +1130,19 @@ define i32 @test_x86_avx_vtestz_pd(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx_vtestz_pd: -; CHECK: ## BB#0: -; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; CHECK-NEXT: vtestpd %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0f,0xc1] -; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vtestz_pd: +; AVX: ## BB#0: +; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX-NEXT: vtestpd %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0f,0xc1] +; AVX-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vtestz_pd: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; AVX512VL-NEXT: vtestpd %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0f,0xc1][1:0.33] +; AVX512VL-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1) ; [#uses=1] ret i32 %res } @@ -865,13 +1150,21 @@ define i32 @test_x86_avx_vtestz_pd_256(<4 x double> %a0, <4 x double> %a1) { -; CHECK-LABEL: test_x86_avx_vtestz_pd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; CHECK-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1] -; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vtestz_pd_256: +; AVX: ## BB#0: +; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1] +; AVX-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] +; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vtestz_pd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; AVX512VL-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1][1:0.33] +; AVX512VL-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0][1:0.50] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1) ; [#uses=1] ret i32 %res } @@ -879,12 +1172,19 @@ define i32 @test_x86_avx_vtestz_ps(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: test_x86_avx_vtestz_ps: -; CHECK: ## BB#0: -; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; CHECK-NEXT: vtestps %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0e,0xc1] -; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vtestz_ps: +; AVX: ## BB#0: +; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX-NEXT: vtestps %xmm1, %xmm0 ## encoding: 
[0xc4,0xe2,0x79,0x0e,0xc1] +; AVX-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vtestz_ps: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; AVX512VL-NEXT: vtestps %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0e,0xc1][1:0.33] +; AVX512VL-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1) ; [#uses=1] ret i32 %res } @@ -892,13 +1192,21 @@ define i32 @test_x86_avx_vtestz_ps_256(<8 x float> %a0, <8 x float> %a1) { -; CHECK-LABEL: test_x86_avx_vtestz_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; CHECK-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1] -; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vtestz_ps_256: +; AVX: ## BB#0: +; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1] +; AVX-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] +; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vtestz_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; AVX512VL-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1][1:0.33] +; AVX512VL-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0][1:0.50] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1) ; [#uses=1] ret i32 %res } @@ -906,10 +1214,15 @@ define void @test_x86_avx_vzeroall() { -; CHECK-LABEL: test_x86_avx_vzeroall: -; CHECK: ## BB#0: -; CHECK-NEXT: vzeroall ## encoding: [0xc5,0xfc,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vzeroall: +; AVX: ## BB#0: +; AVX-NEXT: vzeroall ## encoding: [0xc5,0xfc,0x77] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vzeroall: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vzeroall ## encoding: [0xc5,0xfc,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx.vzeroall() ret void } @@ -917,10 +1230,15 @@ define void @test_x86_avx_vzeroupper() { -; CHECK-LABEL: test_x86_avx_vzeroupper: -; CHECK: ## BB#0: -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vzeroupper: +; AVX: ## BB#0: +; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vzeroupper: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx.vzeroupper() ret void } @@ -938,12 +1256,13 @@ ; ; AVX512VL-LABEL: movnt_dq: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] ; AVX512VL-NEXT: vpaddq LCPI65_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0x05,A,A,A,A] ; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI65_0, kind: FK_Data_4 -; AVX512VL-NEXT: vmovntdq %ymm0, (%eax) ## EVEX TO VEX Compression encoding: 
[0xc5,0xfd,0xe7,0x00] -; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: ## [5:0.50] +; AVX512VL-NEXT: vmovntdq %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x00][1:1.00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %a2 = add <2 x i64> %a1, %a3 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> tail call void @llvm.x86.avx.movnt.dq.256(i8* %p, <4 x i64> %a3) nounwind @@ -961,10 +1280,10 @@ ; ; AVX512VL-LABEL: movnt_ps: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX512VL-NEXT: vmovntps %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x2b,0x00] -; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vmovntps %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x2b,0x00][1:1.00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] tail call void @llvm.x86.avx.movnt.ps.256(i8* %p, <8 x float> %a) nounwind ret void } @@ -983,12 +1302,12 @@ ; ; AVX512VL-LABEL: movnt_pd: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX512VL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x57,0xc9] -; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] -; AVX512VL-NEXT: vmovntpd %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x2b,0x00] -; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x57,0xc9][1:1.00] +; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1][3:1.00] +; AVX512VL-NEXT: vmovntpd %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x2b,0x00][1:1.00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %a2 = fadd <4 x double> %a1, tail call void @llvm.x86.avx.movnt.pd.256(i8* %p, <4 x double> %a2) nounwind ret void @@ -998,10 +1317,15 @@ ; Check for pclmulqdq define <2 x i64> @test_x86_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) { -; CHECK-LABEL: test_x86_pclmulqdq: -; CHECK: ## BB#0: -; CHECK-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x44,0xc1,0x00] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_pclmulqdq: +; AVX: ## BB#0: +; AVX-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x44,0xc1,0x00] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_pclmulqdq: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x44,0xc1,0x00][7:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } Index: test/CodeGen/X86/avx-intrinsics-x86_64.ll =================================================================== --- test/CodeGen/X86/avx-intrinsics-x86_64.ll +++ 
test/CodeGen/X86/avx-intrinsics-x86_64.ll @@ -5,18 +5,20 @@ define <4 x double> @test_x86_avx_vzeroall(<4 x double> %a, <4 x double> %b) { ; AVX-LABEL: test_x86_avx_vzeroall: ; AVX: ## BB#0: -; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmovupd %ymm0, -{{[0-9]+}}(%rsp) ## 32-byte Spill -; AVX-NEXT: vzeroall -; AVX-NEXT: vmovups -{{[0-9]+}}(%rsp), %ymm0 ## 32-byte Reload -; AVX-NEXT: retq +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## [3:1.00] +; AVX-NEXT: vmovupd %ymm0, -{{[0-9]+}}(%rsp) ## 32-byte Spill [1:1.00] +; AVX-NEXT: ## [1:1.00] +; AVX-NEXT: vzeroall ## [?:0.000000e+00] +; AVX-NEXT: vmovups -{{[0-9]+}}(%rsp), %ymm0 ## 32-byte Reload [4:0.50] +; AVX-NEXT: ## [4:0.50] +; AVX-NEXT: retq ## [5:1.00] ; ; AVX512VL-LABEL: test_x86_avx_vzeroall: ; AVX512VL: ## BB#0: ; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm16 -; AVX512VL-NEXT: vzeroall +; AVX512VL-NEXT: vzeroall ## [?:0.000000e+00] ; AVX512VL-NEXT: vmovapd %ymm16, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq ## [5:1.00] %c = fadd <4 x double> %a, %b call void @llvm.x86.avx.vzeroall() ret <4 x double> %c @@ -26,18 +28,20 @@ define <4 x double> @test_x86_avx_vzeroupper(<4 x double> %a, <4 x double> %b) { ; AVX-LABEL: test_x86_avx_vzeroupper: ; AVX: ## BB#0: -; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmovupd %ymm0, -{{[0-9]+}}(%rsp) ## 32-byte Spill -; AVX-NEXT: vzeroupper -; AVX-NEXT: vmovups -{{[0-9]+}}(%rsp), %ymm0 ## 32-byte Reload -; AVX-NEXT: retq +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## [3:1.00] +; AVX-NEXT: vmovupd %ymm0, -{{[0-9]+}}(%rsp) ## 32-byte Spill [1:1.00] +; AVX-NEXT: ## [1:1.00] +; AVX-NEXT: vzeroupper ## [?:0.000000e+00] +; AVX-NEXT: vmovups -{{[0-9]+}}(%rsp), %ymm0 ## 32-byte Reload [4:0.50] +; AVX-NEXT: ## [4:0.50] +; AVX-NEXT: retq ## [5:1.00] ; ; AVX512VL-LABEL: test_x86_avx_vzeroupper: ; AVX512VL: ## BB#0: ; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm16 -; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: vzeroupper ## [?:0.000000e+00] ; AVX512VL-NEXT: vmovapd %ymm16, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq ## [5:1.00] %c = fadd <4 x double> %a, %b call void @llvm.x86.avx.vzeroupper() ret <4 x double> %c Index: test/CodeGen/X86/avx2-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/avx2-intrinsics-x86.ll +++ test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -10,8 +10,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_packssdw: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -26,8 +26,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_packsswb: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -42,8 +42,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_packuswb: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression 
encoding: [0xc5,0xfd,0x67,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -58,8 +58,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_padds_b: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -74,8 +74,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_padds_w: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -90,8 +90,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_paddus_b: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -106,8 +106,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_paddus_w: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -122,8 +122,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pavg_b: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -138,8 +138,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pavg_w: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -154,8 +154,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pmadd_wd: ; AVX512VL: ## BB#0: 
-; AVX512VL-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xc1][5:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -170,8 +170,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pmaxs_w: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xee,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xee,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -186,8 +186,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pmaxu_b: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -202,8 +202,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pmins_w: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xea,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xea,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -218,8 +218,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pminu_b: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpminub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpminub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -227,11 +227,17 @@ define i32 @test_x86_avx2_pmovmskb(<32 x i8> %a0) { -; CHECK-LABEL: test_x86_avx2_pmovmskb: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pmovmskb: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0] +; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pmovmskb: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0][3:1.00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0) ; [#uses=1] ret i32 %res } @@ -246,8 +252,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pmulh_w: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xfd,0xe5,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xc1][5:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -262,8 +268,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pmulhu_w: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1][5:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -278,8 +284,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pmulu_dq: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf4,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf4,0xc1][5:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %a0, <8 x i32> %a1) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -294,8 +300,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psad_bw: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf6,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf6,0xc1][5:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -310,8 +316,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psll_d: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf2,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf2,0xc1][2:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -326,8 +332,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psll_q: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf3,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf3,0xc1][2:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -342,8 +348,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psll_w: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf1,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf1,0xc1][2:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -358,8 +364,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pslli_d: ; AVX512VL: ## BB#0: -; 
AVX512VL-NEXT: vpslld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xf0,0x07] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpslld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xf0,0x07][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -374,8 +380,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pslli_q: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xf0,0x07] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xf0,0x07][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -390,8 +396,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pslli_w: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xf0,0x07] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xf0,0x07][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -406,8 +412,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psra_d: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe2,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe2,0xc1][2:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -422,8 +428,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psra_w: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe1,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe1,0xc1][2:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -438,8 +444,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psrai_d: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsrad $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xe0,0x07] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsrad $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xe0,0x07][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -454,8 +460,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psrai_w: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsraw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xe0,0x07] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsraw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xe0,0x07][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -470,8 +476,8 @@ ; ; AVX512VL-LABEL: 
test_x86_avx2_psrl_d: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xc1][2:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -486,8 +492,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psrl_q: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xc1][2:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -502,8 +508,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psrl_w: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1][2:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -518,8 +524,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psrli_d: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsrld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xd0,0x07] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsrld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xd0,0x07][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -534,8 +540,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psrli_q: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xd0,0x07] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xd0,0x07][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -550,8 +556,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psrli_w: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xd0,0x07] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xd0,0x07][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -566,8 +572,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psubs_b: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ 
-582,8 +588,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psubs_w: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -598,8 +604,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psubus_b: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -614,8 +620,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psubus_w: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -630,8 +636,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pabs_b: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpabsb %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1c,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpabsb %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1c,0xc0][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -646,8 +652,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pabs_d: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpabsd %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1e,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpabsd %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1e,0xc0][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -662,8 +668,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pabs_w: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpabsw %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1d,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpabsw %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1d,0xc0][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -671,10 +677,15 @@ define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) { -; CHECK-LABEL: test_x86_avx2_phadd_d: -; CHECK: ## BB#0: -; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x02,0xc1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_phadd_d: +; AVX2: ## BB#0: +; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x02,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_phadd_d: +; 
AVX512VL: ## BB#0: +; AVX512VL-NEXT: vphaddd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x02,0xc1][3:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -682,10 +693,15 @@ define <16 x i16> @test_x86_avx2_phadd_sw(<16 x i16> %a0, <16 x i16> %a1) { -; CHECK-LABEL: test_x86_avx2_phadd_sw: -; CHECK: ## BB#0: -; CHECK-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x03,0xc1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_phadd_sw: +; AVX2: ## BB#0: +; AVX2-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x03,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_phadd_sw: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x03,0xc1][3:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -693,10 +709,15 @@ define <16 x i16> @test_x86_avx2_phadd_w(<16 x i16> %a0, <16 x i16> %a1) { -; CHECK-LABEL: test_x86_avx2_phadd_w: -; CHECK: ## BB#0: -; CHECK-NEXT: vphaddw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x01,0xc1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_phadd_w: +; AVX2: ## BB#0: +; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x01,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_phadd_w: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vphaddw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x01,0xc1][3:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -704,10 +725,15 @@ define <8 x i32> @test_x86_avx2_phsub_d(<8 x i32> %a0, <8 x i32> %a1) { -; CHECK-LABEL: test_x86_avx2_phsub_d: -; CHECK: ## BB#0: -; CHECK-NEXT: vphsubd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x06,0xc1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_phsub_d: +; AVX2: ## BB#0: +; AVX2-NEXT: vphsubd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x06,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_phsub_d: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vphsubd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x06,0xc1][3:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -715,10 +741,15 @@ define <16 x i16> @test_x86_avx2_phsub_sw(<16 x i16> %a0, <16 x i16> %a1) { -; CHECK-LABEL: test_x86_avx2_phsub_sw: -; CHECK: ## BB#0: -; CHECK-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x07,0xc1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_phsub_sw: +; AVX2: ## BB#0: +; AVX2-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x07,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_phsub_sw: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x07,0xc1][3:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -726,10 +757,15 @@ define <16 x i16> @test_x86_avx2_phsub_w(<16 x i16> %a0, <16 x i16> %a1) 
{ -; CHECK-LABEL: test_x86_avx2_phsub_w: -; CHECK: ## BB#0: -; CHECK-NEXT: vphsubw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x05,0xc1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_phsub_w: +; AVX2: ## BB#0: +; AVX2-NEXT: vphsubw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x05,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_phsub_w: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vphsubw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x05,0xc1][3:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -744,8 +780,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1][5:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -762,10 +798,10 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw_load_op0: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX512VL-NEXT: vmovdqu (%eax), %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x08] -; AVX512VL-NEXT: vpmaddubsw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x04,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vmovdqu (%eax), %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x08][4:0.50] +; AVX512VL-NEXT: vpmaddubsw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x04,0xc0][5:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %a0 = load <32 x i8>, <32 x i8>* %ptr %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res @@ -779,8 +815,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pmul_hr_sw: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1][5:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -795,8 +831,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pshuf_b: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xc1][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <32 x i8> %res } @@ -804,10 +840,15 @@ define <32 x i8> @test_x86_avx2_psign_b(<32 x i8> %a0, <32 x i8> %a1) { -; CHECK-LABEL: test_x86_avx2_psign_b: -; CHECK: ## BB#0: -; CHECK-NEXT: vpsignb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x08,0xc1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; 
AVX2-LABEL: test_x86_avx2_psign_b: +; AVX2: ## BB#0: +; AVX2-NEXT: vpsignb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x08,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psign_b: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpsignb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x08,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -815,10 +856,15 @@ define <8 x i32> @test_x86_avx2_psign_d(<8 x i32> %a0, <8 x i32> %a1) { -; CHECK-LABEL: test_x86_avx2_psign_d: -; CHECK: ## BB#0: -; CHECK-NEXT: vpsignd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0a,0xc1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psign_d: +; AVX2: ## BB#0: +; AVX2-NEXT: vpsignd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0a,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psign_d: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpsignd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0a,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <8 x i32> %res } @@ -826,10 +872,15 @@ define <16 x i16> @test_x86_avx2_psign_w(<16 x i16> %a0, <16 x i16> %a1) { -; CHECK-LABEL: test_x86_avx2_psign_w: -; CHECK: ## BB#0: -; CHECK-NEXT: vpsignw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x09,0xc1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psign_w: +; AVX2: ## BB#0: +; AVX2-NEXT: vpsignw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x09,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psign_w: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpsignw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x09,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -845,9 +896,9 @@ ; ; AVX512VL-LABEL: test_x86_avx2_movntdqa: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX512VL-NEXT: vmovntdqa (%eax), %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2a,0x00] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vmovntdqa (%eax), %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2a,0x00][4:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %a0) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -855,10 +906,15 @@ define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) { -; CHECK-LABEL: test_x86_avx2_mpsadbw: -; CHECK: ## BB#0: -; CHECK-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x07] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_mpsadbw: +; AVX2: ## BB#0: +; AVX2-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x07] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_mpsadbw: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x07][6:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1] 
ret <16 x i16> %res } @@ -873,8 +929,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_packusdw: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -882,10 +938,15 @@ define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) { -; CHECK-LABEL: test_x86_avx2_pblendvb: -; CHECK: ## BB#0: -; CHECK-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4c,0xc1,0x20] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pblendvb: +; AVX2: ## BB#0: +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4c,0xc1,0x20] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pblendvb: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4c,0xc1,0x20][2:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -893,11 +954,15 @@ define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) { -; CHECK-LABEL: test_x86_avx2_pblendw: -; CHECK: ## BB#0: -; CHECK-NEXT: vpblendw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0e,0xc1,0x07] -; CHECK-NEXT: ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pblendw: +; AVX2: ## BB#0: +; AVX2-NEXT: vpblendw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0e,0xc1,0x07]ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pblendw: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpblendw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0e,0xc1,0x07]ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i8 7) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -912,8 +977,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pmaxsb: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -928,8 +993,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pmaxsd: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -944,8 +1009,8 @@ ; ; 
AVX512VL-LABEL: test_x86_avx2_pmaxud: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -960,8 +1025,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pmaxuw: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3e,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3e,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -976,8 +1041,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pminsb: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -992,8 +1057,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pminsd: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1008,8 +1073,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pminud: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpminud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpminud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1024,8 +1089,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_pminuw: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3a,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3a,0xc1][1:0.50] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -1040,11 +1105,15 @@ define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: test_x86_avx2_pblendd_128: -; CHECK: ## BB#0: -; CHECK-NEXT: vpblendd $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x02,0xc0,0x08] -; CHECK-NEXT: ## xmm0 = xmm1[0,1,2],xmm0[3] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pblendd_128: +; AVX2: ## BB#0: +; 
AVX2-NEXT: vpblendd $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x02,0xc0,0x08]xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pblendd_128: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpblendd $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x02,0xc0,0x08]xmm0 = xmm1[0,1,2],xmm0[3][1:0.33] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -1052,11 +1121,15 @@ define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) { -; CHECK-LABEL: test_x86_avx2_pblendd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: vpblendd $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0x07] -; CHECK-NEXT: ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pblendd_256: +; AVX2: ## BB#0: +; AVX2-NEXT: vpblendd $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0x07]ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pblendd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpblendd $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0x07]ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7][1:0.33] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1074,8 +1147,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_permd: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x36,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x36,0xc0][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1093,8 +1166,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_permps: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -1102,11 +1175,15 @@ define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) { -; CHECK-LABEL: test_x86_avx2_vperm2i128: -; CHECK: ## BB#0: -; CHECK-NEXT: vperm2f128 $1, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x01] -; CHECK-NEXT: ## ymm0 = ymm0[2,3,0,1] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_vperm2i128: +; AVX2: ## BB#0: +; AVX2-NEXT: vperm2f128 $1, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x01]ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_vperm2i128: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vperm2f128 $1, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x01]ymm0 = ymm0[2,3,0,1][3:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 1) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -1114,11 +1191,17 @@ define <2 x i64> @test_x86_avx2_maskload_q(i8* %a0, 
<2 x i64> %a1) { -; CHECK-LABEL: test_x86_avx2_maskload_q: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x8c,0x00] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_maskload_q: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x8c,0x00] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_maskload_q: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x8c,0x00][4:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } @@ -1126,11 +1209,17 @@ define <4 x i64> @test_x86_avx2_maskload_q_256(i8* %a0, <4 x i64> %a1) { -; CHECK-LABEL: test_x86_avx2_maskload_q_256: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x8c,0x00] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_maskload_q_256: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x8c,0x00] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_maskload_q_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x8c,0x00][4:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -1138,11 +1227,17 @@ define <4 x i32> @test_x86_avx2_maskload_d(i8* %a0, <4 x i32> %a1) { -; CHECK-LABEL: test_x86_avx2_maskload_d: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x8c,0x00] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_maskload_d: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x8c,0x00] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_maskload_d: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x8c,0x00][4:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -1150,11 +1245,17 @@ define <8 x i32> @test_x86_avx2_maskload_d_256(i8* %a0, <8 x i32> %a1) { -; CHECK-LABEL: test_x86_avx2_maskload_d_256: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x8c,0x00] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_maskload_d_256: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## 
encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x8c,0x00] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_maskload_d_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x8c,0x00][4:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1162,11 +1263,17 @@ define void @test_x86_avx2_maskstore_q(i8* %a0, <2 x i64> %a1, <2 x i64> %a2) { -; CHECK-LABEL: test_x86_avx2_maskstore_q: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0xf9,0x8e,0x08] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_maskstore_q: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0xf9,0x8e,0x08] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_maskstore_q: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0xf9,0x8e,0x08][13:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx2.maskstore.q(i8* %a0, <2 x i64> %a1, <2 x i64> %a2) ret void } @@ -1174,12 +1281,19 @@ define void @test_x86_avx2_maskstore_q_256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) { -; CHECK-LABEL: test_x86_avx2_maskstore_q_256: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_maskstore_q_256: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08] +; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_maskstore_q_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08][13:1.00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx2.maskstore.q.256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) ret void } @@ -1187,11 +1301,17 @@ define void @test_x86_avx2_maskstore_d(i8* %a0, <4 x i32> %a1, <4 x i32> %a2) { -; CHECK-LABEL: test_x86_avx2_maskstore_d: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0x79,0x8e,0x08] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_maskstore_d: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0x79,0x8e,0x08] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_maskstore_d: +; AVX512VL: ## BB#0: +; 
AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0x79,0x8e,0x08][13:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx2.maskstore.d(i8* %a0, <4 x i32> %a1, <4 x i32> %a2) ret void } @@ -1199,12 +1319,19 @@ define void @test_x86_avx2_maskstore_d_256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) { -; CHECK-LABEL: test_x86_avx2_maskstore_d_256: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_maskstore_d_256: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08] +; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_maskstore_d_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08][13:1.00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx2.maskstore.d.256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) ret void } @@ -1219,8 +1346,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psllv_d: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0xc1][2:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -1235,8 +1362,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psllv_d_256: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0xc1][2:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1251,8 +1378,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psllv_q: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0xc1][2:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } @@ -1267,8 +1394,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psllv_q_256: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0xc1][2:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] 
%res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -1283,8 +1410,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psrlv_d: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0xc1][2:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -1299,8 +1426,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psrlv_d_256: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0xc1][2:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1315,8 +1442,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psrlv_q: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0xc1][2:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } @@ -1331,8 +1458,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psrlv_q_256: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0xc1][2:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -1347,8 +1474,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psrav_d: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0xc1][2:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -1356,8 +1483,7 @@ define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) { ; AVX2-LABEL: test_x86_avx2_psrav_d_const: ; AVX2: ## BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] -; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] ; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI91_0, kind: FK_Data_4 ; AVX2-NEXT: vpsravd LCPI91_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] ; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI91_1, kind: FK_Data_4 @@ -1365,12 +1491,13 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psrav_d_const: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vmovdqa LCPI91_0, %xmm0 ## EVEX TO VEX Compression xmm0 = 
[2,9,4294967284,23] -; AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; AVX512VL-NEXT: vmovdqa LCPI91_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] ; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI91_0, kind: FK_Data_4 +; AVX512VL-NEXT: ## [4:0.50] ; AVX512VL-NEXT: vpsravd LCPI91_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] ; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI91_1, kind: FK_Data_4 -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: ## [6:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> , <4 x i32> ) ret <4 x i32> %res } @@ -1384,8 +1511,8 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psrav_d_256: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0xc1] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0xc1][2:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1393,8 +1520,7 @@ define <8 x i32> @test_x86_avx2_psrav_d_256_const(<8 x i32> %a0, <8 x i32> %a1) { ; AVX2-LABEL: test_x86_avx2_psrav_d_256_const: ; AVX2: ## BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] -; AVX2-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI93_0, kind: FK_Data_4 ; AVX2-NEXT: vpsravd LCPI93_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] ; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI93_1, kind: FK_Data_4 @@ -1402,23 +1528,30 @@ ; ; AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vmovdqa LCPI93_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] -; AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; AVX512VL-NEXT: vmovdqa LCPI93_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI93_0, kind: FK_Data_4 +; AVX512VL-NEXT: ## [4:0.50] ; AVX512VL-NEXT: vpsravd LCPI93_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] ; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI93_1, kind: FK_Data_4 -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: ## [6:2.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> , <8 x i32> ) ret <8 x i32> %res } declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1, <4 x i32> %idx, <2 x double> %mask) { -; CHECK-LABEL: test_x86_avx2_gather_d_pd: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x92,0x04,0x48] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_gather_d_pd: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vgatherdpd %xmm2, 
(%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x92,0x04,0x48] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_gather_d_pd: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x92,0x04,0x48][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %a1, <4 x i32> %idx, <2 x double> %mask, i8 2) ; ret <2 x double> %res @@ -1427,11 +1560,17 @@ <4 x i32>, <2 x double>, i8) nounwind readonly define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, i8* %a1, <4 x i32> %idx, <4 x double> %mask) { -; CHECK-LABEL: test_x86_avx2_gather_d_pd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x92,0x04,0x48] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_gather_d_pd_256: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x92,0x04,0x48] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_gather_d_pd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x92,0x04,0x48][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %a1, <4 x i32> %idx, <4 x double> %mask, i8 2) ; ret <4 x double> %res @@ -1440,11 +1579,17 @@ <4 x i32>, <4 x double>, i8) nounwind readonly define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, i8* %a1, <2 x i64> %idx, <2 x double> %mask) { -; CHECK-LABEL: test_x86_avx2_gather_q_pd: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x93,0x04,0x48] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_gather_q_pd: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x93,0x04,0x48] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_gather_q_pd: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x93,0x04,0x48][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %a1, <2 x i64> %idx, <2 x double> %mask, i8 2) ; ret <2 x double> %res @@ -1453,11 +1598,17 @@ <2 x i64>, <2 x double>, i8) nounwind readonly define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, i8* %a1, <4 x i64> %idx, <4 x double> %mask) { -; CHECK-LABEL: test_x86_avx2_gather_q_pd_256: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x93,0x04,0x48] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_gather_q_pd_256: +; AVX2: ## BB#0: +; AVX2-NEXT: 
movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x93,0x04,0x48] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_gather_q_pd_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x93,0x04,0x48][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %a1, <4 x i64> %idx, <4 x double> %mask, i8 2) ; ret <4 x double> %res @@ -1466,11 +1617,17 @@ <4 x i64>, <4 x double>, i8) nounwind readonly define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, i8* %a1, <4 x i32> %idx, <4 x float> %mask) { -; CHECK-LABEL: test_x86_avx2_gather_d_ps: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x92,0x04,0x48] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_gather_d_ps: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x92,0x04,0x48] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_gather_d_ps: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x92,0x04,0x48][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %a1, <4 x i32> %idx, <4 x float> %mask, i8 2) ; ret <4 x float> %res @@ -1479,11 +1636,17 @@ <4 x i32>, <4 x float>, i8) nounwind readonly define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, i8* %a1, <8 x i32> %idx, <8 x float> %mask) { -; CHECK-LABEL: test_x86_avx2_gather_d_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0x6d,0x92,0x04,0x48] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_gather_d_ps_256: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0x6d,0x92,0x04,0x48] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_gather_d_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0x6d,0x92,0x04,0x48][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %a1, <8 x i32> %idx, <8 x float> %mask, i8 2) ; ret <8 x float> %res @@ -1492,11 +1655,17 @@ <8 x i32>, <8 x float>, i8) nounwind readonly define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, i8* %a1, <2 x i64> %idx, <4 x float> %mask) { -; CHECK-LABEL: test_x86_avx2_gather_q_ps: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x93,0x04,0x48] -; CHECK-NEXT: retl ## encoding: [0xc3] +; 
AVX2-LABEL: test_x86_avx2_gather_q_ps: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x93,0x04,0x48] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_gather_q_ps: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x93,0x04,0x48][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %a1, <2 x i64> %idx, <4 x float> %mask, i8 2) ; ret <4 x float> %res @@ -1505,12 +1674,19 @@ <2 x i64>, <4 x float>, i8) nounwind readonly define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1, <4 x i64> %idx, <4 x float> %mask) { -; CHECK-LABEL: test_x86_avx2_gather_q_ps_256: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_gather_q_ps_256: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48] +; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_gather_q_ps_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48][1:0.00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %a1, <4 x i64> %idx, <4 x float> %mask, i8 2) ; ret <4 x float> %res @@ -1519,11 +1695,17 @@ <4 x i64>, <4 x float>, i8) nounwind readonly define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, i8* %a1, <4 x i32> %idx, <2 x i64> %mask) { -; CHECK-LABEL: test_x86_avx2_gather_d_q: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x90,0x04,0x48] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_gather_d_q: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x90,0x04,0x48] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_gather_d_q: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x90,0x04,0x48][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %a1, <4 x i32> %idx, <2 x i64> %mask, i8 2) ; ret <2 x i64> %res @@ -1532,11 +1714,17 @@ <4 x i32>, <2 x i64>, i8) nounwind readonly define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, i8* %a1, <4 x i32> %idx, <4 x i64> %mask) { -; CHECK-LABEL: test_x86_avx2_gather_d_q_256: -; CHECK: ## BB#0: -; 
CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x90,0x04,0x48] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_gather_d_q_256: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x90,0x04,0x48] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_gather_d_q_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x90,0x04,0x48][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %a1, <4 x i32> %idx, <4 x i64> %mask, i8 2) ; ret <4 x i64> %res @@ -1545,11 +1733,17 @@ <4 x i32>, <4 x i64>, i8) nounwind readonly define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, i8* %a1, <2 x i64> %idx, <2 x i64> %mask) { -; CHECK-LABEL: test_x86_avx2_gather_q_q: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x91,0x04,0x48] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_gather_q_q: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x91,0x04,0x48] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_gather_q_q: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x91,0x04,0x48][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %a1, <2 x i64> %idx, <2 x i64> %mask, i8 2) ; ret <2 x i64> %res @@ -1558,11 +1752,17 @@ <2 x i64>, <2 x i64>, i8) nounwind readonly define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, i8* %a1, <4 x i64> %idx, <4 x i64> %mask) { -; CHECK-LABEL: test_x86_avx2_gather_q_q_256: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x91,0x04,0x48] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_gather_q_q_256: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x91,0x04,0x48] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_gather_q_q_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x91,0x04,0x48][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %a1, <4 x i64> %idx, <4 x i64> %mask, i8 2) ; ret <4 x i64> %res @@ -1571,11 +1771,17 @@ <4 x i64>, <4 x i64>, i8) nounwind readonly define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, i8* %a1, <4 x i32> %idx, <4 x i32> %mask) { -; CHECK-LABEL: test_x86_avx2_gather_d_d: -; CHECK: ## 
BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x90,0x04,0x48] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_gather_d_d: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x90,0x04,0x48] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_gather_d_d: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x90,0x04,0x48][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %a0, i8* %a1, <4 x i32> %idx, <4 x i32> %mask, i8 2) ; ret <4 x i32> %res @@ -1584,11 +1790,17 @@ <4 x i32>, <4 x i32>, i8) nounwind readonly define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, i8* %a1, <8 x i32> %idx, <8 x i32> %mask) { -; CHECK-LABEL: test_x86_avx2_gather_d_d_256: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0x6d,0x90,0x04,0x48] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_gather_d_d_256: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0x6d,0x90,0x04,0x48] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_gather_d_d_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0x6d,0x90,0x04,0x48][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %a0, i8* %a1, <8 x i32> %idx, <8 x i32> %mask, i8 2) ; ret <8 x i32> %res @@ -1597,11 +1809,17 @@ <8 x i32>, <8 x i32>, i8) nounwind readonly define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, i8* %a1, <2 x i64> %idx, <4 x i32> %mask) { -; CHECK-LABEL: test_x86_avx2_gather_q_d: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x91,0x04,0x48] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_gather_q_d: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x91,0x04,0x48] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_gather_q_d: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x91,0x04,0x48][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %a0, i8* %a1, <2 x i64> %idx, <4 x i32> %mask, i8 2) ; ret <4 x i32> %res @@ -1610,12 +1828,19 @@ <2 x i64>, <4 x i32>, i8) nounwind readonly define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, i8* %a1, <4 x i64> %idx, <4 x i32> %mask) { -; CHECK-LABEL: test_x86_avx2_gather_q_d_256: -; CHECK: 
## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_gather_q_d_256: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48] +; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_gather_q_d_256: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48][1:0.00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77][1:0.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %a0, i8* %a1, <4 x i64> %idx, <4 x i32> %mask, i8 2) ; ret <4 x i32> %res @@ -1637,12 +1862,12 @@ ; ; AVX512VL-LABEL: test_gather_mask: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX512VL-NEXT: vmovaps %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda] -; AVX512VL-NEXT: vgatherdps %ymm3, (%eax,%ymm1,4), %ymm0 ## encoding: [0xc4,0xe2,0x65,0x92,0x04,0x88] -; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] -; AVX512VL-NEXT: vmovups %ymm2, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x10] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; AVX512VL-NEXT: vmovaps %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda][1:1.00] +; AVX512VL-NEXT: vgatherdps %ymm3, (%eax,%ymm1,4), %ymm0 ## encoding: [0xc4,0xe2,0x65,0x92,0x04,0x88][1:0.00] +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08][4:0.50] +; AVX512VL-NEXT: vmovups %ymm2, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x10][1:1.00] +; AVX512VL-NEXT: retl ## encoding: [0xc3][1:1.00] %a_i8 = bitcast float* %a to i8* %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %a_i8, <8 x i32> %idx, <8 x float> %mask, i8 4) ; Index: test/CodeGen/X86/avx512-bugfix-23634.ll =================================================================== --- test/CodeGen/X86/avx512-bugfix-23634.ll +++ test/CodeGen/X86/avx512-bugfix-23634.ll @@ -7,20 +7,21 @@ define void @f_fu(float* %ret, float* %aa, float %b) { ; CHECK-LABEL: f_fu: ; CHECK: ## BB#0: ## %allocas -; CHECK-NEXT: vcvttss2si %xmm0, %eax +; CHECK-NEXT: vcvttss2si %xmm0, %eax ## [4:1.00] ; CHECK-NEXT: vpbroadcastd %eax, %zmm0 ; CHECK-NEXT: vcvttps2dq (%rsi), %zmm1 ; CHECK-NEXT: vpsrld $31, %zmm0, %zmm2 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm2 ; CHECK-NEXT: vpsrad $1, %zmm2, %zmm2 ; CHECK-NEXT: movw $-21846, %ax ## imm = 0xAAAA +; CHECK-NEXT: ## [1:0.25] ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovdqa32 {{.*}}(%rip), %zmm1 {%k1} +; CHECK-NEXT: vmovdqa32 {{.*}}(%rip), %zmm1 {%k1} ## [4:0.50] ; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 ; CHECK-NEXT: vmovups %zmm0, (%rdi) -; CHECK-NEXT: retq +; CHECK-NEXT: retq ## [1:1.00] allocas: %ptr_cast_for_load = bitcast float* %aa to <16 x float>* 
 %ptr_masked_load.39 = load <16 x float>, <16 x float>* %ptr_cast_for_load, align 4
Index: test/CodeGen/X86/avx512-bugfix-25270.ll
===================================================================
--- test/CodeGen/X86/avx512-bugfix-25270.ll
+++ test/CodeGen/X86/avx512-bugfix-25270.ll
@@ -6,21 +6,22 @@
 define void @bar__512(<16 x i32>* %var) #0 {
 ; CHECK-LABEL: bar__512:
 ; CHECK: ## BB#0: ## %allocas
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $112, %rsp
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: vmovups (%rbx), %zmm0
-; CHECK-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill
+; CHECK-NEXT: pushq %rbx ## [1:1.00]
+; CHECK-NEXT: subq $112, %rsp ## [1:0.25]
+; CHECK-NEXT: movq %rdi, %rbx ## [1:0.25]
+; CHECK-NEXT: vmovups (%rbx), %zmm0 ## [4:0.50]
+; CHECK-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill [1:0.00]
 ; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %zmm1
 ; CHECK-NEXT: vmovaps %zmm1, (%rbx)
 ; CHECK-NEXT: callq _Print__512
-; CHECK-NEXT: vmovups (%rsp), %zmm0 ## 64-byte Reload
+; CHECK-NEXT: vmovups (%rsp), %zmm0 ## 64-byte Reload [4:0.50]
+; CHECK-NEXT: ## [4:0.50]
 ; CHECK-NEXT: callq _Print__512
 ; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %zmm0
 ; CHECK-NEXT: vmovaps %zmm0, (%rbx)
-; CHECK-NEXT: addq $112, %rsp
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: retq
+; CHECK-NEXT: addq $112, %rsp ## [1:0.25]
+; CHECK-NEXT: popq %rbx ## [4:0.50]
+; CHECK-NEXT: retq ## [1:1.00]
 allocas:
 %var_load_load = load <16 x i32>, <16 x i32>* %var, align 1
 store <16 x i32> , <16 x i32>* %var, align 64
Index: test/CodeGen/X86/avx512-cmp.ll
===================================================================
--- test/CodeGen/X86/avx512-cmp.ll
+++ test/CodeGen/X86/avx512-cmp.ll
@@ -5,15 +5,15 @@
 define double @test1(double %a, double %b) nounwind {
 ; ALL-LABEL: test1:
 ; ALL: ## BB#0:
-; ALL-NEXT: vucomisd %xmm1, %xmm0
-; ALL-NEXT: jne LBB0_1
-; ALL-NEXT: jnp LBB0_2
+; ALL-NEXT: vucomisd %xmm1, %xmm0 ## [3:1.00]
+; ALL-NEXT: jne LBB0_1 ## [1:0.50]
+; ALL-NEXT: jnp LBB0_2 ## [1:0.50]
 ; ALL-NEXT: LBB0_1: ## %l1
-; ALL-NEXT: vsubsd %xmm1, %xmm0, %xmm0
-; ALL-NEXT: retq
+; ALL-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ## [3:1.00]
+; ALL-NEXT: retq ## [1:1.00]
 ; ALL-NEXT: LBB0_2: ## %l2
-; ALL-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; ALL-NEXT: retq
+; ALL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ## [3:1.00]
+; ALL-NEXT: retq ## [1:1.00]
 %tobool = fcmp une double %a, %b
 br i1 %tobool, label %l1, label %l2
@@ -28,14 +28,14 @@
 define float @test2(float %a, float %b) nounwind {
 ; ALL-LABEL: test2:
 ; ALL: ## BB#0:
-; ALL-NEXT: vucomiss %xmm0, %xmm1
-; ALL-NEXT: jbe LBB1_2
+; ALL-NEXT: vucomiss %xmm0, %xmm1 ## [3:1.00]
+; ALL-NEXT: jbe LBB1_2 ## [1:0.50]
 ; ALL-NEXT: ## BB#1: ## %l1
-; ALL-NEXT: vsubss %xmm1, %xmm0, %xmm0
-; ALL-NEXT: retq
+; ALL-NEXT: vsubss %xmm1, %xmm0, %xmm0 ## [3:1.00]
+; ALL-NEXT: retq ## [1:1.00]
 ; ALL-NEXT: LBB1_2: ## %l2
-; ALL-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; ALL-NEXT: retq
+; ALL-NEXT: vaddss %xmm1, %xmm0, %xmm0 ## [3:1.00]
+; ALL-NEXT: retq ## [1:1.00]
 %tobool = fcmp olt float %a, %b
 br i1 %tobool, label %l1, label %l2
@@ -51,12 +51,12 @@
 define i32 @test3(float %a, float %b) {
 ; ALL-LABEL: test3:
 ; ALL: ## BB#0:
-; ALL-NEXT: vucomiss %xmm1, %xmm0
-; ALL-NEXT: setnp %al
-; ALL-NEXT: sete %cl
-; ALL-NEXT: andb %al, %cl
-; ALL-NEXT: movzbl %cl, %eax
-; ALL-NEXT: retq
+; ALL-NEXT: vucomiss %xmm1, %xmm0 ## [3:1.00]
+; ALL-NEXT: setnp %al ## [1:0.50]
+; ALL-NEXT: sete %cl ## [1:0.50]
+; ALL-NEXT: andb %al, %cl ## [1:0.25]
+; ALL-NEXT: movzbl %cl, %eax ## [1:0.25]
+; ALL-NEXT: retq ## [1:1.00]
 %cmp10.i = fcmp oeq float %a, %b
%conv11.i = zext i1 %cmp10.i to i32 @@ -66,18 +66,18 @@ define float @test5(float %p) #0 { ; ALL-LABEL: test5: ; ALL: ## BB#0: ## %entry -; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vucomiss %xmm1, %xmm0 -; ALL-NEXT: jne LBB3_1 -; ALL-NEXT: jp LBB3_1 +; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## [1:1.00] +; ALL-NEXT: vucomiss %xmm1, %xmm0 ## [3:1.00] +; ALL-NEXT: jne LBB3_1 ## [1:0.50] +; ALL-NEXT: jp LBB3_1 ## [1:0.50] ; ALL-NEXT: ## BB#2: ## %return -; ALL-NEXT: retq +; ALL-NEXT: retq ## [1:1.00] ; ALL-NEXT: LBB3_1: ## %if.end -; ALL-NEXT: seta %al -; ALL-NEXT: movzbl %al, %eax -; ALL-NEXT: leaq {{.*}}(%rip), %rcx -; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; ALL-NEXT: retq +; ALL-NEXT: seta %al ## [1:0.50] +; ALL-NEXT: movzbl %al, %eax ## [1:0.25] +; ALL-NEXT: leaq {{.*}}(%rip), %rcx ## [1:0.50] +; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; ALL-NEXT: retq ## [1:1.00] entry: %cmp = fcmp oeq float %p, 0.000000e+00 br i1 %cmp, label %return, label %if.end @@ -95,10 +95,10 @@ define i32 @test6(i32 %a, i32 %b) { ; ALL-LABEL: test6: ; ALL: ## BB#0: -; ALL-NEXT: xorl %eax, %eax -; ALL-NEXT: cmpl %esi, %edi -; ALL-NEXT: sete %al -; ALL-NEXT: retq +; ALL-NEXT: xorl %eax, %eax ## [1:0.25] +; ALL-NEXT: cmpl %esi, %edi ## [1:0.25] +; ALL-NEXT: sete %al ## [1:0.50] +; ALL-NEXT: retq ## [1:1.00] %cmp = icmp eq i32 %a, %b %res = zext i1 %cmp to i32 ret i32 %res @@ -107,10 +107,10 @@ define i32 @test7(double %x, double %y) #2 { ; ALL-LABEL: test7: ; ALL: ## BB#0: ## %entry -; ALL-NEXT: xorl %eax, %eax -; ALL-NEXT: vucomisd %xmm1, %xmm0 -; ALL-NEXT: setne %al -; ALL-NEXT: retq +; ALL-NEXT: xorl %eax, %eax ## [1:0.25] +; ALL-NEXT: vucomisd %xmm1, %xmm0 ## [3:1.00] +; ALL-NEXT: setne %al ## [1:0.50] +; ALL-NEXT: retq ## [1:1.00] entry: %0 = fcmp one double %x, %y %or = zext i1 %0 to i32 @@ -120,14 +120,15 @@ define i32 @test8(i32 %a1, i32 %a2, i32 %a3) { ; ALL-LABEL: test8: ; ALL: ## BB#0: -; ALL-NEXT: testl %edx, %edx -; ALL-NEXT: movl $1, %eax -; ALL-NEXT: cmovel %eax, %edx +; ALL-NEXT: testl %edx, %edx ## [1:0.25] +; ALL-NEXT: movl $1, %eax ## [1:0.25] +; ALL-NEXT: cmovel %eax, %edx ## [2:0.50] ; ALL-NEXT: cmpl $-2147483648, %esi ## imm = 0x80000000 -; ALL-NEXT: cmovnel %edx, %eax -; ALL-NEXT: cmpl $-1, %edi -; ALL-NEXT: cmovnel %edx, %eax -; ALL-NEXT: retq +; ALL-NEXT: ## [1:0.25] +; ALL-NEXT: cmovnel %edx, %eax ## [2:0.50] +; ALL-NEXT: cmpl $-1, %edi ## [1:0.25] +; ALL-NEXT: cmovnel %edx, %eax ## [2:0.50] +; ALL-NEXT: retq ## [1:1.00] %tmp1 = icmp eq i32 %a1, -1 %tmp2 = icmp eq i32 %a2, -2147483648 %tmp3 = and i1 %tmp1, %tmp2 @@ -140,14 +141,14 @@ define i32 @test9(i64 %a) { ; ALL-LABEL: test9: ; ALL: ## BB#0: -; ALL-NEXT: testb $1, %dil -; ALL-NEXT: jne LBB7_2 +; ALL-NEXT: testb $1, %dil ## [1:0.25] +; ALL-NEXT: jne LBB7_2 ## [1:0.50] ; ALL-NEXT: ## BB#1: ## %A -; ALL-NEXT: movl $6, %eax -; ALL-NEXT: retq +; ALL-NEXT: movl $6, %eax ## [1:0.25] +; ALL-NEXT: retq ## [1:1.00] ; ALL-NEXT: LBB7_2: ## %B -; ALL-NEXT: movl $7, %eax -; ALL-NEXT: retq +; ALL-NEXT: movl $7, %eax ## [1:0.25] +; ALL-NEXT: retq ## [1:1.00] %b = and i64 %a, 1 %cmp10.i = icmp eq i64 %b, 0 br i1 %cmp10.i, label %A, label %B @@ -160,45 +161,45 @@ define i32 @test10(i64 %b, i64 %c, i1 %d) { ; KNL-LABEL: test10: ; KNL: ## BB#0: -; KNL-NEXT: andl $1, %edx +; KNL-NEXT: andl $1, %edx ## [1:0.25] ; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: cmpq %rsi, %rdi -; KNL-NEXT: sete %al -; KNL-NEXT: andl $1, %eax +; KNL-NEXT: cmpq %rsi, %rdi ## [1:0.25] +; KNL-NEXT: sete %al ## [1:0.50] +; KNL-NEXT: andl $1, 
%eax ## [1:0.25] ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: korw %k1, %k0, %k1 ; KNL-NEXT: kxorw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: testb %al, %al -; KNL-NEXT: je LBB8_1 +; KNL-NEXT: andl $1, %eax ## [1:0.25] +; KNL-NEXT: testb %al, %al ## [1:0.25] +; KNL-NEXT: je LBB8_1 ## [1:0.50] ; KNL-NEXT: ## BB#2: ## %if.end.i -; KNL-NEXT: movl $6, %eax -; KNL-NEXT: retq +; KNL-NEXT: movl $6, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] ; KNL-NEXT: LBB8_1: ## %if.then.i -; KNL-NEXT: movl $5, %eax -; KNL-NEXT: retq +; KNL-NEXT: movl $5, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test10: ; SKX: ## BB#0: -; SKX-NEXT: andl $1, %edx +; SKX-NEXT: andl $1, %edx ## [1:0.25] ; SKX-NEXT: kmovd %edx, %k0 -; SKX-NEXT: cmpq %rsi, %rdi -; SKX-NEXT: sete %al -; SKX-NEXT: andl $1, %eax +; SKX-NEXT: cmpq %rsi, %rdi ## [1:0.25] +; SKX-NEXT: sete %al ## [1:0.50] +; SKX-NEXT: andl $1, %eax ## [1:0.25] ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: korw %k1, %k0, %k1 ; SKX-NEXT: kxorw %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: testb %al, %al -; SKX-NEXT: je LBB8_1 +; SKX-NEXT: andl $1, %eax ## [1:0.25] +; SKX-NEXT: testb %al, %al ## [1:0.25] +; SKX-NEXT: je LBB8_1 ## [1:0.50] ; SKX-NEXT: ## BB#2: ## %if.end.i -; SKX-NEXT: movl $6, %eax -; SKX-NEXT: retq +; SKX-NEXT: movl $6, %eax ## [1:0.25] +; SKX-NEXT: retq ## [1:1.00] ; SKX-NEXT: LBB8_1: ## %if.then.i -; SKX-NEXT: movl $5, %eax -; SKX-NEXT: retq +; SKX-NEXT: movl $5, %eax ## [1:0.25] +; SKX-NEXT: retq ## [1:1.00] %cmp8.i = icmp eq i64 %b, %c %or1 = or i1 %d, %cmp8.i Index: test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512-mask-op.ll +++ test/CodeGen/X86/avx512-mask-op.ll @@ -12,7 +12,7 @@ ; KNL-NEXT: knotw %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AX %AX %EAX -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: mask16: ; SKX: ## BB#0: @@ -20,7 +20,7 @@ ; SKX-NEXT: knotw %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AX %AX %EAX -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: mask16: ; AVX512BW: ## BB#0: @@ -49,14 +49,14 @@ ; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: knotw %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: mask16_zext: ; SKX: ## BB#0: ; SKX-NEXT: kmovd %edi, %k0 ; SKX-NEXT: knotw %k0, %k0 ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: mask16_zext: ; AVX512BW: ## BB#0: @@ -85,7 +85,7 @@ ; KNL-NEXT: knotw %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AL %AL %EAX -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: mask8: ; SKX: ## BB#0: @@ -93,7 +93,7 @@ ; SKX-NEXT: knotb %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AL %AL %EAX -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: mask8: ; AVX512BW: ## BB#0: @@ -122,15 +122,15 @@ ; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: knotw %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: movzbl %al, %eax -; KNL-NEXT: retq +; KNL-NEXT: movzbl %al, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: mask8_zext: ; SKX: ## BB#0: ; SKX-NEXT: kmovd %edi, %k0 ; SKX-NEXT: knotb %k0, %k0 ; SKX-NEXT: kmovb %k0, %eax -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: mask8_zext: ; AVX512BW: ## BB#0: @@ -154,12 +154,33 @@ } define void @mask16_mem(i16* %ptr) { -; CHECK-LABEL: mask16_mem: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw 
(%rdi), %k0 -; CHECK-NEXT: knotw %k0, %k0 -; CHECK-NEXT: kmovw %k0, (%rdi) -; CHECK-NEXT: retq +; KNL-LABEL: mask16_mem: +; KNL: ## BB#0: +; KNL-NEXT: kmovw (%rdi), %k0 +; KNL-NEXT: knotw %k0, %k0 +; KNL-NEXT: kmovw %k0, (%rdi) +; KNL-NEXT: retq ## [1:1.00] +; +; SKX-LABEL: mask16_mem: +; SKX: ## BB#0: +; SKX-NEXT: kmovw (%rdi), %k0 +; SKX-NEXT: knotw %k0, %k0 +; SKX-NEXT: kmovw %k0, (%rdi) +; SKX-NEXT: retq ## [1:1.00] +; +; AVX512BW-LABEL: mask16_mem: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: knotw %k0, %k0 +; AVX512BW-NEXT: kmovw %k0, (%rdi) +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: mask16_mem: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: knotw %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, (%rdi) +; AVX512DQ-NEXT: retq %x = load i16, i16* %ptr, align 4 %m0 = bitcast i16 %x to <16 x i1> %m1 = xor <16 x i1> %m0, @@ -171,19 +192,19 @@ define void @mask8_mem(i8* %ptr) { ; KNL-LABEL: mask8_mem: ; KNL: ## BB#0: -; KNL-NEXT: movzbl (%rdi), %eax +; KNL-NEXT: movzbl (%rdi), %eax ## [4:0.50] ; KNL-NEXT: kmovw %eax, %k0 ; KNL-NEXT: knotw %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: movb %al, (%rdi) -; KNL-NEXT: retq +; KNL-NEXT: movb %al, (%rdi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: mask8_mem: ; SKX: ## BB#0: ; SKX-NEXT: kmovb (%rdi), %k0 ; SKX-NEXT: knotb %k0, %k0 ; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: mask8_mem: ; AVX512BW: ## BB#0: @@ -209,14 +230,41 @@ } define i16 @mand16(i16 %x, i16 %y) { -; CHECK-LABEL: mand16: -; CHECK: ## BB#0: -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: xorl %esi, %eax -; CHECK-NEXT: andl %esi, %edi -; CHECK-NEXT: orl %eax, %edi -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: retq +; KNL-LABEL: mand16: +; KNL: ## BB#0: +; KNL-NEXT: movl %edi, %eax ## [1:0.25] +; KNL-NEXT: xorl %esi, %eax ## [1:0.25] +; KNL-NEXT: andl %esi, %edi ## [1:0.25] +; KNL-NEXT: orl %eax, %edi ## [1:0.25] +; KNL-NEXT: movl %edi, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] +; +; SKX-LABEL: mand16: +; SKX: ## BB#0: +; SKX-NEXT: movl %edi, %eax ## [1:0.25] +; SKX-NEXT: xorl %esi, %eax ## [1:0.25] +; SKX-NEXT: andl %esi, %edi ## [1:0.25] +; SKX-NEXT: orl %eax, %edi ## [1:0.25] +; SKX-NEXT: movl %edi, %eax ## [1:0.25] +; SKX-NEXT: retq ## [1:1.00] +; +; AVX512BW-LABEL: mand16: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: movl %edi, %eax +; AVX512BW-NEXT: xorl %esi, %eax +; AVX512BW-NEXT: andl %esi, %edi +; AVX512BW-NEXT: orl %eax, %edi +; AVX512BW-NEXT: movl %edi, %eax +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: mand16: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: movl %edi, %eax +; AVX512DQ-NEXT: xorl %esi, %eax +; AVX512DQ-NEXT: andl %esi, %edi +; AVX512DQ-NEXT: orl %eax, %edi +; AVX512DQ-NEXT: movl %edi, %eax +; AVX512DQ-NEXT: retq %ma = bitcast i16 %x to <16 x i1> %mb = bitcast i16 %y to <16 x i1> %mc = and <16 x i1> %ma, %mb @@ -236,7 +284,7 @@ ; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AX %AX %EAX -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: mand16_mem: ; SKX: ## BB#0: @@ -247,7 +295,7 @@ ; SKX-NEXT: korw %k0, %k2, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AX %AX %EAX -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: mand16_mem: ; AVX512BW: ## BB#0: @@ -286,7 +334,7 @@ ; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AL %AL %EAX -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: shuf_test1: ; SKX: ## BB#0: @@ -294,7 +342,7 @@ ; 
SKX-NEXT: kshiftrw $8, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AL %AL %EAX -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: shuf_test1: ; AVX512BW: ## BB#0: @@ -320,22 +368,22 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) { ; KNL-LABEL: zext_test1: ; KNL: ## BB#0: -; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ## [?:0.000000e+00] ; KNL-NEXT: kshiftlw $10, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: retq +; KNL-NEXT: andl $1, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: zext_test1: ; SKX: ## BB#0: -; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ## [?:0.000000e+00] ; SKX-NEXT: kshiftlw $10, %k0, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: andl $1, %eax ## [1:0.25] +; SKX-NEXT: vzeroupper ## [1:0.00] +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: zext_test1: ; AVX512BW: ## BB#0: @@ -365,24 +413,24 @@ define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) { ; KNL-LABEL: zext_test2: ; KNL: ## BB#0: -; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ## [?:0.000000e+00] ; KNL-NEXT: kshiftlw $10, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: andl $1, %eax +; KNL-NEXT: andl $1, %eax ## [1:0.25] ; KNL-NEXT: ## kill: %AX %AX %EAX -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: zext_test2: ; SKX: ## BB#0: -; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ## [?:0.000000e+00] ; SKX-NEXT: kshiftlw $10, %k0, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax +; SKX-NEXT: andl $1, %eax ## [1:0.25] ; SKX-NEXT: ## kill: %AX %AX %EAX -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper ## [1:0.00] +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: zext_test2: ; AVX512BW: ## BB#0: @@ -414,24 +462,24 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) { ; KNL-LABEL: zext_test3: ; KNL: ## BB#0: -; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ## [?:0.000000e+00] ; KNL-NEXT: kshiftlw $10, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: andl $1, %eax +; KNL-NEXT: andl $1, %eax ## [1:0.25] ; KNL-NEXT: ## kill: %AL %AL %EAX -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: zext_test3: ; SKX: ## BB#0: -; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ## [?:0.000000e+00] ; SKX-NEXT: kshiftlw $10, %k0, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax +; SKX-NEXT: andl $1, %eax ## [1:0.25] ; SKX-NEXT: ## kill: %AL %AL %EAX -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper ## [1:0.00] +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: zext_test3: ; AVX512BW: ## BB#0: @@ -465,18 +513,18 @@ ; KNL: ## BB#0: ## %entry ; KNL-NEXT: kxnorw %k0, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: movb %al, (%rdi) -; KNL-NEXT: movb $-2, -{{[0-9]+}}(%rsp) -; KNL-NEXT: movb $-2, %al -; KNL-NEXT: retq +; KNL-NEXT: movb %al, (%rdi) ## [1:1.00] +; KNL-NEXT: movb $-2, -{{[0-9]+}}(%rsp) ## [1:1.00] +; KNL-NEXT: movb $-2, %al ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: conv1: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: kxnorw %k0, %k0, %k0 ; SKX-NEXT: kmovb %k0, (%rdi) -; 
SKX-NEXT: movb $-2, -{{[0-9]+}}(%rsp) -; SKX-NEXT: movb $-2, %al -; SKX-NEXT: retq +; SKX-NEXT: movb $-2, -{{[0-9]+}}(%rsp) ## [1:1.00] +; SKX-NEXT: movb $-2, %al ## [1:0.25] +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: conv1: ; AVX512BW: ## BB#0: ## %entry @@ -507,21 +555,21 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) { ; KNL-LABEL: test4: ; KNL: ## BB#0: -; KNL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ## [5:1.00] ; KNL-NEXT: vpmovqd %zmm0, %ymm0 -; KNL-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm1 +; KNL-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm1 ## [5:1.00] ; KNL-NEXT: vpmovqd %zmm1, %ymm1 -; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; KNL-NEXT: retq +; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ## [1:0.50] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test4: ; SKX: ## BB#0: -; SKX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 -; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 +; SKX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 ## [?:0.000000e+00] +; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 ## [?:0.000000e+00] ; SKX-NEXT: kandnw %k0, %k1, %k0 ; SKX-NEXT: vpmovm2d %k0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper ## [1:0.00] +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test4: ; AVX512BW: ## BB#0: @@ -552,18 +600,18 @@ define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) { ; KNL-LABEL: test5: ; KNL: ## BB#0: -; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 -; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; KNL-NEXT: retq +; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ## [5:1.00] +; KNL-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 ## [5:1.00] +; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ## [5:1.00] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test5: ; SKX: ## BB#0: -; SKX-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 -; SKX-NEXT: vpcmpgtq %xmm3, %xmm2, %k1 +; SKX-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 ## [?:0.000000e+00] +; SKX-NEXT: vpcmpgtq %xmm3, %xmm2, %k1 ## [?:0.000000e+00] ; SKX-NEXT: kandnw %k1, %k0, %k0 ; SKX-NEXT: vpmovm2q %k0, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test5: ; AVX512BW: ## BB#0: @@ -602,22 +650,22 @@ ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 -; KNL-NEXT: movb $85, %al +; KNL-NEXT: movb $85, %al ## [1:0.25] ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: testb %al, %al -; KNL-NEXT: retq +; KNL-NEXT: testb %al, %al ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test7: ; SKX: ## BB#0: ## %allocas -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 ## [1:1.00] ; SKX-NEXT: vpmovw2m %xmm0, %k0 -; SKX-NEXT: movb $85, %al +; SKX-NEXT: movb $85, %al ## [1:0.25] ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: ktestb %k0, %k0 -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test7: ; AVX512BW: ## BB#0: ## %allocas @@ -657,34 +705,34 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) { ; KNL-LABEL: test8: ; KNL: ## BB#0: -; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2 -; KNL-NEXT: cmpl %esi, %edi -; KNL-NEXT: jg LBB17_1 +; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## [?:0.000000e+00] +; KNL-NEXT: cmpl %esi, %edi ## [1:0.25] +; KNL-NEXT: jg LBB17_1 ## [1:0.50] ; KNL-NEXT: ## BB#2: -; KNL-NEXT: vpcmpltud %zmm2, %zmm1, %k1 -; KNL-NEXT: jmp LBB17_3 +; KNL-NEXT: vpcmpltud %zmm2, %zmm1, %k1 ## [?:0.000000e+00] +; KNL-NEXT: jmp LBB17_3 ## [1:0.50] ; KNL-NEXT: LBB17_1: -; KNL-NEXT: 
vpcmpgtd %zmm2, %zmm0, %k1 +; KNL-NEXT: vpcmpgtd %zmm2, %zmm0, %k1 ## [?:0.000000e+00] ; KNL-NEXT: LBB17_3: ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test8: ; SKX: ## BB#0: -; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2 -; SKX-NEXT: cmpl %esi, %edi -; SKX-NEXT: jg LBB17_1 +; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## [?:0.000000e+00] +; SKX-NEXT: cmpl %esi, %edi ## [1:0.25] +; SKX-NEXT: jg LBB17_1 ## [1:0.50] ; SKX-NEXT: ## BB#2: -; SKX-NEXT: vpcmpltud %zmm2, %zmm1, %k0 +; SKX-NEXT: vpcmpltud %zmm2, %zmm1, %k0 ## [?:0.000000e+00] ; SKX-NEXT: vpmovm2b %k0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper ## [1:0.00] +; SKX-NEXT: retq ## [1:1.00] ; SKX-NEXT: LBB17_1: -; SKX-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 +; SKX-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 ## [?:0.000000e+00] ; SKX-NEXT: vpmovm2b %k0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper ## [1:0.00] +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test8: ; AVX512BW: ## BB#0: @@ -727,11 +775,11 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) { ; KNL-LABEL: test9: ; KNL: ## BB#0: -; KNL-NEXT: cmpl %esi, %edi -; KNL-NEXT: jg LBB18_1 +; KNL-NEXT: cmpl %esi, %edi ## [1:0.25] +; KNL-NEXT: jg LBB18_1 ## [1:0.50] ; KNL-NEXT: ## BB#2: ; KNL-NEXT: vpmovsxbd %xmm1, %zmm0 -; KNL-NEXT: jmp LBB18_3 +; KNL-NEXT: jmp LBB18_3 ## [1:0.50] ; KNL-NEXT: LBB18_1: ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: LBB18_3: @@ -739,21 +787,21 @@ ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test9: ; SKX: ## BB#0: -; SKX-NEXT: cmpl %esi, %edi -; SKX-NEXT: jg LBB18_1 +; SKX-NEXT: cmpl %esi, %edi ## [1:0.25] +; SKX-NEXT: jg LBB18_1 ## [1:0.50] ; SKX-NEXT: ## BB#2: -; SKX-NEXT: vpsllw $7, %xmm1, %xmm0 -; SKX-NEXT: jmp LBB18_3 +; SKX-NEXT: vpsllw $7, %xmm1, %xmm0 ## [1:1.00] +; SKX-NEXT: jmp LBB18_3 ## [1:0.50] ; SKX-NEXT: LBB18_1: -; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 ## [1:1.00] ; SKX-NEXT: LBB18_3: ; SKX-NEXT: vpmovb2m %xmm0, %k0 ; SKX-NEXT: vpmovm2b %k0, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test9: ; AVX512BW: ## BB#0: @@ -799,26 +847,26 @@ define <4 x i1> @test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) { ; KNL-LABEL: test11: ; KNL: ## BB#0: -; KNL-NEXT: cmpl %esi, %edi -; KNL-NEXT: jg LBB20_2 +; KNL-NEXT: cmpl %esi, %edi ## [1:0.25] +; KNL-NEXT: jg LBB20_2 ## [1:0.50] ; KNL-NEXT: ## BB#1: -; KNL-NEXT: vmovaps %xmm1, %xmm0 +; KNL-NEXT: vmovaps %xmm1, %xmm0 ## [1:1.00] ; KNL-NEXT: LBB20_2: -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test11: ; SKX: ## BB#0: -; SKX-NEXT: cmpl %esi, %edi -; SKX-NEXT: jg LBB20_1 +; SKX-NEXT: cmpl %esi, %edi ## [1:0.25] +; SKX-NEXT: jg LBB20_1 ## [1:0.50] ; SKX-NEXT: ## BB#2: -; SKX-NEXT: vpslld $31, %xmm1, %xmm0 -; SKX-NEXT: jmp LBB20_3 +; SKX-NEXT: vpslld $31, %xmm1, %xmm0 ## [1:1.00] +; SKX-NEXT: jmp LBB20_3 ## [1:0.50] ; SKX-NEXT: LBB20_1: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 ## [1:1.00] ; SKX-NEXT: LBB20_3: ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 ; SKX-NEXT: vpmovm2d %k0, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test11: ; AVX512BW: ## BB#0: @@ -843,10 +891,25 @@ } define i32 @test12(i32 %x, i32 %y) { -; CHECK-LABEL: test12: -; CHECK: ## BB#0: -; CHECK-NEXT: 
movl %edi, %eax -; CHECK-NEXT: retq +; KNL-LABEL: test12: +; KNL: ## BB#0: +; KNL-NEXT: movl %edi, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] +; +; SKX-LABEL: test12: +; SKX: ## BB#0: +; SKX-NEXT: movl %edi, %eax ## [1:0.25] +; SKX-NEXT: retq ## [1:1.00] +; +; AVX512BW-LABEL: test12: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: movl %edi, %eax +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test12: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: movl %edi, %eax +; AVX512DQ-NEXT: retq %a = bitcast i16 21845 to <16 x i1> %b = extractelement <16 x i1> %a, i32 0 %c = select i1 %b, i32 %x, i32 %y @@ -854,10 +917,25 @@ } define i32 @test13(i32 %x, i32 %y) { -; CHECK-LABEL: test13: -; CHECK: ## BB#0: -; CHECK-NEXT: movl %esi, %eax -; CHECK-NEXT: retq +; KNL-LABEL: test13: +; KNL: ## BB#0: +; KNL-NEXT: movl %esi, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] +; +; SKX-LABEL: test13: +; SKX: ## BB#0: +; SKX-NEXT: movl %esi, %eax ## [1:0.25] +; SKX-NEXT: retq ## [1:1.00] +; +; AVX512BW-LABEL: test13: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: movl %esi, %eax +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test13: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: movl %esi, %eax +; AVX512DQ-NEXT: retq %a = bitcast i16 21845 to <16 x i1> %b = extractelement <16 x i1> %a, i32 3 %c = select i1 %b, i32 %x, i32 %y @@ -872,24 +950,26 @@ define <16 x i1> @test15(i32 %x, i32 %y) { ; KNL-LABEL: test15: ; KNL: ## BB#0: -; KNL-NEXT: cmpl %esi, %edi +; KNL-NEXT: cmpl %esi, %edi ## [1:0.25] ; KNL-NEXT: movw $21845, %ax ## imm = 0x5555 -; KNL-NEXT: movw $1, %cx -; KNL-NEXT: cmovgw %ax, %cx +; KNL-NEXT: ## [1:0.25] +; KNL-NEXT: movw $1, %cx ## [1:0.25] +; KNL-NEXT: cmovgw %ax, %cx ## [2:0.50] ; KNL-NEXT: kmovw %ecx, %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test15: ; SKX: ## BB#0: -; SKX-NEXT: cmpl %esi, %edi +; SKX-NEXT: cmpl %esi, %edi ## [1:0.25] ; SKX-NEXT: movw $21845, %ax ## imm = 0x5555 -; SKX-NEXT: movw $1, %cx -; SKX-NEXT: cmovgw %ax, %cx +; SKX-NEXT: ## [1:0.25] +; SKX-NEXT: movw $1, %cx ## [1:0.25] +; SKX-NEXT: cmovgw %ax, %cx ## [2:0.50] ; SKX-NEXT: kmovd %ecx, %k0 ; SKX-NEXT: vpmovm2b %k0, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test15: ; AVX512BW: ## BB#0: @@ -925,42 +1005,42 @@ ; ; KNL-LABEL: test16: ; KNL: ## BB#0: -; KNL-NEXT: pushq %rbp +; KNL-NEXT: pushq %rbp ## [1:1.00] ; KNL-NEXT: Lcfi0: ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: Lcfi1: ; KNL-NEXT: .cfi_offset %rbp, -16 -; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: movq %rsp, %rbp ## [1:0.25] ; KNL-NEXT: Lcfi2: ; KNL-NEXT: .cfi_def_cfa_register %rbp -; KNL-NEXT: andq $-32, %rsp -; KNL-NEXT: subq $64, %rsp -; KNL-NEXT: movl %edi, (%rsp) -; KNL-NEXT: shrq $32, %rdi -; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; KNL-NEXT: andq $-32, %rsp ## [1:0.25] +; KNL-NEXT: subq $64, %rsp ## [1:0.25] +; KNL-NEXT: movl %edi, (%rsp) ## [1:1.00] +; KNL-NEXT: shrq $32, %rdi ## [1:0.50] +; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp) ## [1:1.00] ; KNL-NEXT: kmovw (%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: movl $1, %eax -; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; KNL-NEXT: movl $1, %eax ## [1:0.25] +; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ## [1:1.00] +; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, 
%ymm0 ## [3:1.00] ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} ; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 -; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 -; KNL-NEXT: movq %rbp, %rsp -; KNL-NEXT: popq %rbp -; KNL-NEXT: retq +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ## [3:1.00] +; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 ## [1:1.00] +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ## [5:0.50] +; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## [1:0.33] +; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 ## [1:0.50] +; KNL-NEXT: movq %rbp, %rsp ## [1:0.25] +; KNL-NEXT: popq %rbp ## [4:0.50] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test16: ; SKX: ## BB#0: @@ -968,15 +1048,15 @@ ; SKX-NEXT: kxnorw %k0, %k0, %k1 ; SKX-NEXT: kshiftrw $15, %k1, %k1 ; SKX-NEXT: vpmovm2b %k1, %zmm0 -; SKX-NEXT: vpsllq $40, %xmm0, %xmm0 +; SKX-NEXT: vpsllq $40, %xmm0, %xmm0 ## [1:1.00] ; SKX-NEXT: vpmovm2b %k0, %zmm1 -; SKX-NEXT: movl $32, %eax +; SKX-NEXT: movl $32, %eax ## [1:0.25] ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; SKX-NEXT: vpmovb2m %zmm0, %k0 ; SKX-NEXT: vpmovm2b %k0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test16: ; AVX512BW: ## BB#0: @@ -1041,62 +1121,62 @@ ; ; KNL-LABEL: test17: ; KNL: ## BB#0: -; KNL-NEXT: pushq %rbp +; KNL-NEXT: pushq %rbp ## [1:1.00] ; KNL-NEXT: Lcfi3: ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: Lcfi4: ; KNL-NEXT: .cfi_offset %rbp, -16 -; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: movq %rsp, %rbp ## [1:0.25] ; KNL-NEXT: Lcfi5: ; KNL-NEXT: .cfi_def_cfa_register %rbp -; KNL-NEXT: andq $-32, %rsp -; KNL-NEXT: subq $64, %rsp -; KNL-NEXT: movl %edi, (%rsp) -; KNL-NEXT: shrq $32, %rdi -; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; KNL-NEXT: andq $-32, %rsp ## [1:0.25] +; KNL-NEXT: subq $64, %rsp ## [1:0.25] +; KNL-NEXT: movl %edi, (%rsp) ## [1:1.00] +; KNL-NEXT: shrq $32, %rdi ## [1:0.50] +; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp) ## [1:1.00] ; KNL-NEXT: kmovw (%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: xorl %eax, %eax -; KNL-NEXT: cmpl %edx, %esi -; KNL-NEXT: setg %al -; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; KNL-NEXT: xorl %eax, %eax ## [1:0.25] +; KNL-NEXT: cmpl %edx, %esi ## [1:0.25] +; KNL-NEXT: setg %al ## [1:0.50] +; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ## [1:1.00] +; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## [3:1.00] ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} ; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 -; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 -; KNL-NEXT: movq %rbp, %rsp -; KNL-NEXT: popq %rbp -; KNL-NEXT: retq +; KNL-NEXT: 
vinserti128 $1, %xmm2, %ymm1, %ymm1 ## [3:1.00] +; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 ## [1:1.00] +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ## [5:0.50] +; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## [1:0.33] +; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 ## [1:0.50] +; KNL-NEXT: movq %rbp, %rsp ## [1:0.25] +; KNL-NEXT: popq %rbp ## [4:0.50] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test17: ; SKX: ## BB#0: ; SKX-NEXT: kmovq %rdi, %k0 -; SKX-NEXT: cmpl %edx, %esi -; SKX-NEXT: setg %al -; SKX-NEXT: andl $1, %eax +; SKX-NEXT: cmpl %edx, %esi ## [1:0.25] +; SKX-NEXT: setg %al ## [1:0.50] +; SKX-NEXT: andl $1, %eax ## [1:0.25] ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vpmovm2b %k1, %zmm0 -; SKX-NEXT: vpsllq $40, %xmm0, %xmm0 +; SKX-NEXT: vpsllq $40, %xmm0, %xmm0 ## [1:1.00] ; SKX-NEXT: vpmovm2b %k0, %zmm1 -; SKX-NEXT: movl $32, %eax +; SKX-NEXT: movl $32, %eax ## [1:0.25] ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; SKX-NEXT: vpmovb2m %zmm0, %k0 ; SKX-NEXT: vpmovm2b %k0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test17: ; AVX512BW: ## BB#0: @@ -1173,7 +1253,7 @@ ; KNL-NEXT: kshiftrw $15, %k2, %k2 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7][4:0.50] ; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 @@ -1183,7 +1263,7 @@ ; KNL-NEXT: korw %k0, %k1, %k1 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test18: ; SKX: ## BB#0: @@ -1195,7 +1275,7 @@ ; SKX-NEXT: kshiftrw $15, %k1, %k1 ; SKX-NEXT: vpmovm2q %k0, %zmm0 ; SKX-NEXT: vpmovm2q %k1, %zmm1 -; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7][4:0.50] ; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; SKX-NEXT: vpmovq2m %zmm2, %k0 ; SKX-NEXT: kshiftlb $1, %k0, %k0 @@ -1203,8 +1283,8 @@ ; SKX-NEXT: kshiftlb $7, %k2, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: vpmovm2w %k0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper ## [1:0.00] +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test18: ; AVX512BW: ## BB#0: @@ -1261,23 +1341,23 @@ define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone { ; KNL-LABEL: test21: ; KNL: ## BB#0: -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; KNL-NEXT: vpsllw $15, %ymm3, %ymm3 -; KNL-NEXT: vpsraw $15, %ymm3, %ymm3 -; KNL-NEXT: vpand %ymm0, %ymm3, %ymm0 -; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2 -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; KNL-NEXT: vpsllw $15, %ymm2, %ymm2 -; KNL-NEXT: vpsraw $15, %ymm2, %ymm2 -; KNL-NEXT: vpand %ymm1, %ymm2, %ymm1 -; KNL-NEXT: retq +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero[3:1.00] +; KNL-NEXT: vpsllw $15, %ymm3, %ymm3 ## [1:1.00] +; KNL-NEXT: vpsraw $15, %ymm3, %ymm3 ## [1:1.00] +; KNL-NEXT: vpand %ymm0, %ymm3, %ymm0 ## [1:0.33] +; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2 ## [3:1.00] +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero[3:1.00] +; KNL-NEXT: vpsllw $15, %ymm2, %ymm2 ## [1:1.00] +; KNL-NEXT: vpsraw $15, %ymm2, %ymm2 ## [1:1.00] +; KNL-NEXT: vpand %ymm1, %ymm2, %ymm1 ## [1:0.33] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test21: ; SKX: ## BB#0: -; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 +; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 ## [1:1.00] ; SKX-NEXT: vpmovb2m %ymm1, %k1 ; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test21: ; AVX512BW: ## BB#0: @@ -1306,18 +1386,18 @@ ; KNL-LABEL: test22: ; KNL: ## BB#0: ; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 -; KNL-NEXT: vpslld $31, %ymm0, %ymm0 +; KNL-NEXT: vpslld $31, %ymm0, %ymm0 ## [1:1.00] ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: movb %al, (%rdi) -; KNL-NEXT: retq +; KNL-NEXT: movb %al, (%rdi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test22: ; SKX: ## BB#0: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 ## [1:1.00] ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 ; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test22: ; AVX512BW: ## BB#0: @@ -1348,15 +1428,15 @@ ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: movb %al, (%rdi) -; KNL-NEXT: retq +; KNL-NEXT: movb %al, (%rdi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test23: ; SKX: ## BB#0: -; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 ## [1:1.00] ; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 ; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test23: ; AVX512BW: ## BB#0: @@ -1383,24 +1463,24 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) { ; KNL-LABEL: store_v1i1: ; KNL: ## BB#0: -; KNL-NEXT: andl $1, %edi +; KNL-NEXT: andl $1, %edi ## [1:0.25] ; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kxnorw %k0, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kxorw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: movb %al, (%rsi) -; KNL-NEXT: retq +; KNL-NEXT: movb %al, (%rsi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: store_v1i1: ; SKX: ## BB#0: -; SKX-NEXT: andl $1, %edi +; SKX-NEXT: andl $1, %edi ## [1:0.25] ; SKX-NEXT: kmovd %edi, %k0 ; SKX-NEXT: kxnorw %k0, %k0, %k1 ; SKX-NEXT: kshiftrw $15, %k1, %k1 ; SKX-NEXT: kxorw %k1, %k0, %k0 ; SKX-NEXT: kmovb %k0, (%rsi) -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: store_v1i1: ; AVX512BW: ## BB#0: @@ -1430,20 +1510,20 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) { ; KNL-LABEL: store_v2i1: ; KNL: ## BB#0: -; KNL-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ## [5:0.50] ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: 
movb %al, (%rdi) -; KNL-NEXT: retq +; KNL-NEXT: movb %al, (%rdi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: store_v2i1: ; SKX: ## BB#0: -; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 ## [1:1.00] ; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 ; SKX-NEXT: knotw %k0, %k0 ; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: store_v2i1: ; AVX512BW: ## BB#0: @@ -1471,21 +1551,21 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) { ; KNL-LABEL: store_v4i1: ; KNL: ## BB#0: -; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 -; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; KNL-NEXT: vpslld $31, %ymm0, %ymm0 +; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 ## [4:0.50] +; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0 ## [1:0.33] +; KNL-NEXT: vpslld $31, %ymm0, %ymm0 ## [1:1.00] ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: movb %al, (%rdi) -; KNL-NEXT: retq +; KNL-NEXT: movb %al, (%rdi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: store_v4i1: ; SKX: ## BB#0: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 ## [1:1.00] ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 ; SKX-NEXT: knotw %k0, %k0 ; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: store_v4i1: ; AVX512BW: ## BB#0: @@ -1520,16 +1600,16 @@ ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: knotw %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: movb %al, (%rdi) -; KNL-NEXT: retq +; KNL-NEXT: movb %al, (%rdi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: store_v8i1: ; SKX: ## BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 ## [1:1.00] ; SKX-NEXT: vpmovw2m %xmm0, %k0 ; SKX-NEXT: knotb %k0, %k0 ; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: store_v8i1: ; AVX512BW: ## BB#0: @@ -1563,15 +1643,15 @@ ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: knotw %k0, %k0 ; KNL-NEXT: kmovw %k0, (%rdi) -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: store_v16i1: ; SKX: ## BB#0: -; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 ## [1:1.00] ; SKX-NEXT: vpmovb2m %xmm0, %k0 ; SKX-NEXT: knotw %k0, %k0 ; SKX-NEXT: kmovw %k0, (%rdi) -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: store_v16i1: ; AVX512BW: ## BB#0: @@ -1612,29 +1692,29 @@ define void @f1(i32 %c) { ; KNL-LABEL: f1: ; KNL: ## BB#0: ## %entry -; KNL-NEXT: movzbl {{.*}}(%rip), %edi -; KNL-NEXT: movl %edi, %eax -; KNL-NEXT: andl $1, %eax +; KNL-NEXT: movzbl {{.*}}(%rip), %edi ## [4:0.50] +; KNL-NEXT: movl %edi, %eax ## [1:0.25] +; KNL-NEXT: andl $1, %eax ## [1:0.25] ; KNL-NEXT: kmovw %eax, %k0 ; KNL-NEXT: kxnorw %k0, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kxorw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: movb %al, {{.*}}(%rip) -; KNL-NEXT: xorl $1, %edi +; KNL-NEXT: movb %al, {{.*}}(%rip) ## [1:1.00] +; KNL-NEXT: xorl $1, %edi ## [1:0.25] ; KNL-NEXT: jmp _f2 ## TAILCALL ; ; SKX-LABEL: f1: ; SKX: ## BB#0: ## %entry -; SKX-NEXT: movzbl {{.*}}(%rip), %edi -; SKX-NEXT: movl %edi, %eax -; SKX-NEXT: andl $1, %eax +; SKX-NEXT: movzbl {{.*}}(%rip), %edi ## [4:0.50] +; SKX-NEXT: movl %edi, %eax ## [1:0.25] +; SKX-NEXT: andl $1, %eax ## [1:0.25] ; SKX-NEXT: kmovd %eax, %k0 ; SKX-NEXT: kxnorw %k0, %k0, %k1 ; SKX-NEXT: kshiftrw $15, %k1, %k1 ; SKX-NEXT: kxorw %k1, %k0, %k0 ; SKX-NEXT: kmovb %k0, {{.*}}(%rip) -; SKX-NEXT: xorl $1, %edi +; SKX-NEXT: xorl $1, %edi ## 
[1:0.25] ; SKX-NEXT: jmp _f2 ## TAILCALL ; ; AVX512BW-LABEL: f1: @@ -1675,22 +1755,58 @@ declare void @f2(i32) #1 define void @store_i16_i1(i16 %x, i1 *%y) { -; CHECK-LABEL: store_i16_i1: -; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: movb %dil, (%rsi) -; CHECK-NEXT: retq +; KNL-LABEL: store_i16_i1: +; KNL: ## BB#0: +; KNL-NEXT: andl $1, %edi ## [1:0.25] +; KNL-NEXT: movb %dil, (%rsi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] +; +; SKX-LABEL: store_i16_i1: +; SKX: ## BB#0: +; SKX-NEXT: andl $1, %edi ## [1:0.25] +; SKX-NEXT: movb %dil, (%rsi) ## [1:1.00] +; SKX-NEXT: retq ## [1:1.00] +; +; AVX512BW-LABEL: store_i16_i1: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movb %dil, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: store_i16_i1: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movb %dil, (%rsi) +; AVX512DQ-NEXT: retq %c = trunc i16 %x to i1 store i1 %c, i1* %y ret void } define void @store_i8_i1(i8 %x, i1 *%y) { -; CHECK-LABEL: store_i8_i1: -; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: movb %dil, (%rsi) -; CHECK-NEXT: retq +; KNL-LABEL: store_i8_i1: +; KNL: ## BB#0: +; KNL-NEXT: andl $1, %edi ## [1:0.25] +; KNL-NEXT: movb %dil, (%rsi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] +; +; SKX-LABEL: store_i8_i1: +; SKX: ## BB#0: +; SKX-NEXT: andl $1, %edi ## [1:0.25] +; SKX-NEXT: movb %dil, (%rsi) ## [1:1.00] +; SKX-NEXT: retq ## [1:1.00] +; +; AVX512BW-LABEL: store_i8_i1: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: movb %dil, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: store_i8_i1: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: andl $1, %edi +; AVX512DQ-NEXT: movb %dil, (%rsi) +; AVX512DQ-NEXT: retq %c = trunc i8 %x to i1 store i1 %c, i1* %y ret void @@ -1699,16 +1815,17 @@ define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) { ; KNL-LABEL: test_build_vec_v32i1: ; KNL: ## BB#0: -; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; KNL-NEXT: retq +; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ## [5:1.00] +; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 ## [5:1.00] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test_build_vec_v32i1: ; SKX: ## BB#0: ; SKX-NEXT: movl $1497715861, %eax ## imm = 0x59455495 +; SKX-NEXT: ## [1:0.25] ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test_build_vec_v32i1: ; AVX512BW: ## BB#0: @@ -1729,16 +1846,17 @@ define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) { ; KNL-LABEL: test_build_vec_v64i1: ; KNL: ## BB#0: -; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; KNL-NEXT: retq +; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ## [5:1.00] +; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 ## [5:1.00] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test_build_vec_v64i1: ; SKX: ## BB#0: ; SKX-NEXT: movabsq $6432645796886517060, %rax ## imm = 0x5945594549549544 +; SKX-NEXT: ## [1:0.25] ; SKX-NEXT: kmovq %rax, %k1 ; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test_build_vec_v64i1: ; AVX512BW: ## BB#0: @@ -1759,36 +1877,36 @@ define void @ktest_1(<8 x double> %in, double * %base) { ; KNL-LABEL: ktest_1: ; KNL: ## BB#0: -; KNL-NEXT: vmovupd (%rdi), %zmm1 +; KNL-NEXT: vmovupd (%rdi), %zmm1 ## [4:0.50] ; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; KNL-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} +; KNL-NEXT: vmovupd 8(%rdi), %zmm1 
{%k1} {z} ## [4:0.50] ; KNL-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: testb %al, %al -; KNL-NEXT: je LBB41_2 +; KNL-NEXT: testb %al, %al ## [1:0.25] +; KNL-NEXT: je LBB41_2 ## [1:0.50] ; KNL-NEXT: ## BB#1: ## %L1 ; KNL-NEXT: vmovapd %zmm0, (%rdi) -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; KNL-NEXT: LBB41_2: ## %L2 ; KNL-NEXT: vmovapd %zmm0, 8(%rdi) -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: ktest_1: ; SKX: ## BB#0: -; SKX-NEXT: vmovupd (%rdi), %zmm1 +; SKX-NEXT: vmovupd (%rdi), %zmm1 ## [4:0.50] ; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} +; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} ## [4:0.50] ; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} ; SKX-NEXT: ktestb %k0, %k0 -; SKX-NEXT: je LBB41_2 +; SKX-NEXT: je LBB41_2 ## [1:0.50] ; SKX-NEXT: ## BB#1: ## %L1 ; SKX-NEXT: vmovapd %zmm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper ## [1:0.00] +; SKX-NEXT: retq ## [1:1.00] ; SKX-NEXT: LBB41_2: ## %L2 ; SKX-NEXT: vmovapd %zmm0, 8(%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper ## [1:0.00] +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: ktest_1: ; AVX512BW: ## BB#0: @@ -1855,18 +1973,18 @@ ; ; KNL-LABEL: ktest_2: ; KNL: ## BB#0: -; KNL-NEXT: pushq %rbp +; KNL-NEXT: pushq %rbp ## [1:1.00] ; KNL-NEXT: Lcfi6: ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: Lcfi7: ; KNL-NEXT: .cfi_offset %rbp, -16 -; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: movq %rsp, %rbp ## [1:0.25] ; KNL-NEXT: Lcfi8: ; KNL-NEXT: .cfi_def_cfa_register %rbp -; KNL-NEXT: andq $-32, %rsp -; KNL-NEXT: subq $32, %rsp -; KNL-NEXT: vmovups (%rdi), %zmm2 -; KNL-NEXT: vmovups 64(%rdi), %zmm3 +; KNL-NEXT: andq $-32, %rsp ## [1:0.25] +; KNL-NEXT: subq $32, %rsp ## [1:0.25] +; KNL-NEXT: vmovups (%rdi), %zmm2 ## [4:0.50] +; KNL-NEXT: vmovups 64(%rdi), %zmm3 ## [4:0.50] ; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k1 ; KNL-NEXT: kshiftlw $14, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 @@ -1874,63 +1992,63 @@ ; KNL-NEXT: kshiftlw $15, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: vmovd %ecx, %xmm3 -; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; KNL-NEXT: vmovd %ecx, %xmm3 ## [1:1.00] +; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $13, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $12, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $11, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $10, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $9, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $8, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $7, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, 
%k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $6, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $5, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $4, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $3, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $2, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $1, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftrw $15, %k1, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k2 ; KNL-NEXT: kshiftlw $14, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 @@ -1938,66 +2056,66 @@ ; KNL-NEXT: kshiftlw $15, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: vmovd %ecx, %xmm2 -; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; KNL-NEXT: vmovd %ecx, %xmm2 ## [1:1.00] +; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 ## [1:1.00] ; KNL-NEXT: kshiftlw $13, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ## [1:1.00] ; KNL-NEXT: kshiftlw $12, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 ## [1:1.00] ; KNL-NEXT: kshiftlw $11, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ## [1:1.00] ; KNL-NEXT: kshiftlw $10, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ## [1:1.00] ; KNL-NEXT: kshiftlw $9, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 ## [1:1.00] ; KNL-NEXT: kshiftlw $8, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 ## [1:1.00] ; KNL-NEXT: kshiftlw $7, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ## [1:1.00] ; KNL-NEXT: kshiftlw $6, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 ## [1:1.00] ; KNL-NEXT: kshiftlw $5, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: 
kmovw %k0, %eax -; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 ## [1:1.00] ; KNL-NEXT: kshiftlw $4, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 ## [1:1.00] ; KNL-NEXT: kshiftlw $3, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ## [1:1.00] ; KNL-NEXT: kshiftlw $2, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ## [1:1.00] ; KNL-NEXT: kshiftlw $1, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ## [1:1.00] ; KNL-NEXT: kshiftrw $15, %k2, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; KNL-NEXT: vmovups 4(%rdi), %zmm3 {%k2} {z} -; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k1} {z} +; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ## [1:1.00] +; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ## [3:1.00] +; KNL-NEXT: vmovups 4(%rdi), %zmm3 {%k2} {z} ## [4:0.50] +; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k1} {z} ## [4:0.50] ; KNL-NEXT: vcmpltps %zmm4, %zmm1, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 @@ -2005,63 +2123,63 @@ ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vmovd %ecx, %xmm4 -; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; KNL-NEXT: vmovd %ecx, %xmm4 ## [1:1.00] +; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ## [1:1.00] ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ## [1:1.00] ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ## [1:1.00] ; KNL-NEXT: kshiftlw $11, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ## [1:1.00] ; KNL-NEXT: kshiftlw $10, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ## [1:1.00] ; KNL-NEXT: kshiftlw $9, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ## [1:1.00] ; KNL-NEXT: kshiftlw $8, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ## [1:1.00] ; KNL-NEXT: kshiftlw $7, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ## [1:1.00] ; KNL-NEXT: kshiftlw $6, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ## [1:1.00] ; KNL-NEXT: kshiftlw $5, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $10, 
%eax, %xmm4, %xmm4 ## [1:1.00] ; KNL-NEXT: kshiftlw $4, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ## [1:1.00] ; KNL-NEXT: kshiftlw $3, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ## [1:1.00] ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ## [1:1.00] ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ## [1:1.00] ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 ## [1:1.00] ; KNL-NEXT: vcmpltps %zmm3, %zmm0, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 @@ -2069,66 +2187,66 @@ ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vmovd %ecx, %xmm3 -; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; KNL-NEXT: vmovd %ecx, %xmm3 ## [1:1.00] +; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $11, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $10, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $9, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $8, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $7, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $6, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $5, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $4, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $3, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $12, %eax, 
%xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ## [1:1.00] ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; KNL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; KNL-NEXT: vpor %ymm3, %ymm2, %ymm2 -; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3 +; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ## [1:1.00] +; KNL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 ## [3:1.00] +; KNL-NEXT: vpor %ymm3, %ymm2, %ymm2 ## [1:0.33] +; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3 ## [3:1.00] ; KNL-NEXT: vpmovsxbd %xmm3, %zmm3 ; KNL-NEXT: vpslld $31, %zmm3, %zmm3 ; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 @@ -2137,45 +2255,45 @@ ; KNL-NEXT: vpslld $31, %zmm2, %zmm2 ; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 ; KNL-NEXT: kmovw %k0, (%rsp) -; KNL-NEXT: cmpl $0, (%rsp) -; KNL-NEXT: je LBB42_2 +; KNL-NEXT: cmpl $0, (%rsp) ## [5:1.00] +; KNL-NEXT: je LBB42_2 ## [1:0.50] ; KNL-NEXT: ## BB#1: ## %L1 ; KNL-NEXT: vmovaps %zmm0, (%rdi) ; KNL-NEXT: vmovaps %zmm1, 64(%rdi) -; KNL-NEXT: jmp LBB42_3 +; KNL-NEXT: jmp LBB42_3 ## [1:0.50] ; KNL-NEXT: LBB42_2: ## %L2 ; KNL-NEXT: vmovaps %zmm0, 4(%rdi) ; KNL-NEXT: vmovaps %zmm1, 68(%rdi) ; KNL-NEXT: LBB42_3: ## %End -; KNL-NEXT: movq %rbp, %rsp -; KNL-NEXT: popq %rbp -; KNL-NEXT: retq +; KNL-NEXT: movq %rbp, %rsp ## [1:0.25] +; KNL-NEXT: popq %rbp ## [4:0.50] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: ktest_2: ; SKX: ## BB#0: -; SKX-NEXT: vmovups (%rdi), %zmm2 -; SKX-NEXT: vmovups 64(%rdi), %zmm3 +; SKX-NEXT: vmovups (%rdi), %zmm2 ## [4:0.50] +; SKX-NEXT: vmovups 64(%rdi), %zmm3 ## [4:0.50] ; SKX-NEXT: vcmpltps %zmm0, %zmm2, %k1 ; SKX-NEXT: vcmpltps %zmm1, %zmm3, %k2 ; SKX-NEXT: kunpckwd %k1, %k2, %k0 -; SKX-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z} -; SKX-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z} +; SKX-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z} ## [4:0.50] +; SKX-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z} ## [4:0.50] ; SKX-NEXT: vcmpltps %zmm3, %zmm0, %k1 ; SKX-NEXT: vcmpltps %zmm2, %zmm1, %k2 ; SKX-NEXT: kunpckwd %k1, %k2, %k1 ; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: ktestd %k0, %k0 -; SKX-NEXT: je LBB42_2 +; SKX-NEXT: je LBB42_2 ## [1:0.50] ; SKX-NEXT: ## BB#1: ## %L1 ; SKX-NEXT: vmovaps %zmm0, (%rdi) ; SKX-NEXT: vmovaps %zmm1, 64(%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper ## [1:0.00] +; SKX-NEXT: retq ## [1:1.00] ; SKX-NEXT: LBB42_2: ## %L2 ; SKX-NEXT: vmovaps %zmm0, 4(%rdi) ; SKX-NEXT: vmovaps %zmm1, 68(%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper ## [1:0.00] +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: ktest_2: ; AVX512BW: ## BB#0: @@ -2531,16 +2649,16 @@ define <8 x i64> @load_8i1(<8 x i1>* %a) { ; KNL-LABEL: load_8i1: ; KNL: ## BB#0: -; KNL-NEXT: movzbl (%rdi), %eax +; KNL-NEXT: movzbl (%rdi), %eax ## [4:0.50] ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: load_8i1: ; SKX: ## BB#0: ; SKX-NEXT: kmovb (%rdi), %k0 ; SKX-NEXT: vpmovm2q %k0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: load_8i1: ; AVX512BW: ## BB#0: @@ -2564,13 +2682,13 @@ ; KNL: ## BB#0: ; KNL-NEXT: kmovw (%rdi), %k1 ; KNL-NEXT: 
vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: load_16i1: ; SKX: ## BB#0: ; SKX-NEXT: kmovw (%rdi), %k0 ; SKX-NEXT: vpmovm2d %k0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: load_16i1: ; AVX512BW: ## BB#0: @@ -2591,17 +2709,17 @@ define <2 x i16> @load_2i1(<2 x i1>* %a) { ; KNL-LABEL: load_2i1: ; KNL: ## BB#0: -; KNL-NEXT: movzbl (%rdi), %eax +; KNL-NEXT: movzbl (%rdi), %eax ## [4:0.50] ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: load_2i1: ; SKX: ## BB#0: ; SKX-NEXT: kmovb (%rdi), %k0 ; SKX-NEXT: vpmovm2q %k0, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: load_2i1: ; AVX512BW: ## BB#0: @@ -2627,18 +2745,18 @@ define <4 x i16> @load_4i1(<4 x i1>* %a) { ; KNL-LABEL: load_4i1: ; KNL: ## BB#0: -; KNL-NEXT: movzbl (%rdi), %eax +; KNL-NEXT: movzbl (%rdi), %eax ## [4:0.50] ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: load_4i1: ; SKX: ## BB#0: ; SKX-NEXT: kmovb (%rdi), %k0 ; SKX-NEXT: vpmovm2d %k0, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: load_4i1: ; AVX512BW: ## BB#0: @@ -2671,13 +2789,13 @@ ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vpmovdw %zmm1, %ymm1 -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: load_32i1: ; SKX: ## BB#0: ; SKX-NEXT: kmovd (%rdi), %k0 ; SKX-NEXT: vpmovm2w %k0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: load_32i1: ; AVX512BW: ## BB#0: @@ -2710,19 +2828,19 @@ ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ## [3:1.00] ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k4} {z} ; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; KNL-NEXT: retq +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ## [3:1.00] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: load_64i1: ; SKX: ## BB#0: ; SKX-NEXT: kmovq (%rdi), %k0 ; SKX-NEXT: vpmovm2b %k0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: load_64i1: ; AVX512BW: ## BB#0: @@ -2759,15 +2877,15 @@ ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: movb %al, (%rdi) -; KNL-NEXT: retq +; KNL-NEXT: movb %al, (%rdi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: store_8i1: ; SKX: ## BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 ## [1:1.00] ; SKX-NEXT: vpmovw2m %xmm0, %k0 ; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: store_8i1: ; AVX512BW: ## BB#0: @@ -2797,15 +2915,15 @@ ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: movb %al, (%rdi) -; KNL-NEXT: retq +; KNL-NEXT: movb %al, (%rdi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: store_8i1_1: ; SKX: ## BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpsllw $15, 
%xmm0, %xmm0 ## [1:1.00] ; SKX-NEXT: vpmovw2m %xmm0, %k0 ; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: store_8i1_1: ; AVX512BW: ## BB#0: @@ -2836,14 +2954,14 @@ ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, (%rdi) -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: store_16i1: ; SKX: ## BB#0: -; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 ## [1:1.00] ; SKX-NEXT: vpmovb2m %xmm0, %k0 ; SKX-NEXT: kmovw %k0, (%rdi) -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: store_16i1: ; AVX512BW: ## BB#0: @@ -2868,7 +2986,7 @@ define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) { ; KNL-LABEL: store_32i1: ; KNL: ## BB#0: -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 ## [3:1.00] ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 ; KNL-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 @@ -2877,15 +2995,15 @@ ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, (%rdi) -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: store_32i1: ; SKX: ## BB#0: -; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 +; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 ## [1:1.00] ; SKX-NEXT: vpmovb2m %ymm0, %k0 ; SKX-NEXT: kmovd %k0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper ## [1:0.00] +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: store_32i1: ; AVX512BW: ## BB#0: @@ -2927,15 +3045,15 @@ ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, (%rdi) -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: store_32i1_1: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $15, %zmm0, %zmm0 ; SKX-NEXT: vpmovw2m %zmm0, %k0 ; SKX-NEXT: kmovd %k0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper ## [1:0.00] +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: store_32i1_1: ; AVX512BW: ## BB#0: @@ -2971,22 +3089,22 @@ ; ; KNL-LABEL: store_64i1: ; KNL: ## BB#0: -; KNL-NEXT: pushq %rbp +; KNL-NEXT: pushq %rbp ## [1:1.00] ; KNL-NEXT: Lcfi9: ; KNL-NEXT: .cfi_def_cfa_offset 16 -; KNL-NEXT: pushq %r15 +; KNL-NEXT: pushq %r15 ## [1:1.00] ; KNL-NEXT: Lcfi10: ; KNL-NEXT: .cfi_def_cfa_offset 24 -; KNL-NEXT: pushq %r14 +; KNL-NEXT: pushq %r14 ## [1:1.00] ; KNL-NEXT: Lcfi11: ; KNL-NEXT: .cfi_def_cfa_offset 32 -; KNL-NEXT: pushq %r13 +; KNL-NEXT: pushq %r13 ## [1:1.00] ; KNL-NEXT: Lcfi12: ; KNL-NEXT: .cfi_def_cfa_offset 40 -; KNL-NEXT: pushq %r12 +; KNL-NEXT: pushq %r12 ## [1:1.00] ; KNL-NEXT: Lcfi13: ; KNL-NEXT: .cfi_def_cfa_offset 48 -; KNL-NEXT: pushq %rbx +; KNL-NEXT: pushq %rbx ## [1:1.00] ; KNL-NEXT: Lcfi14: ; KNL-NEXT: .cfi_def_cfa_offset 56 ; KNL-NEXT: Lcfi15: @@ -3054,26 +3172,26 @@ ; KNL-NEXT: kmovw %k1, %esi ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vmovd %r9d, %xmm3 +; KNL-NEXT: vmovd %r9d, %xmm3 ## [1:1.00] ; KNL-NEXT: kmovw %k1, %r9d ; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $1, %r8d, %xmm3, %xmm2 -; KNL-NEXT: vpinsrb $2, %r10d, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $4, %r14d, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $5, %r15d, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $6, %r12d, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $7, %r13d, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $8, %ebx, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $9, %ebp, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $11, %ecx, 
%xmm2, %xmm2 -; KNL-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $13, %esi, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $14, %r9d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $1, %r8d, %xmm3, %xmm2 ## [1:1.00] +; KNL-NEXT: vpinsrb $2, %r10d, %xmm2, %xmm2 ## [1:1.00] +; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2 ## [1:1.00] +; KNL-NEXT: vpinsrb $4, %r14d, %xmm2, %xmm2 ## [1:1.00] +; KNL-NEXT: vpinsrb $5, %r15d, %xmm2, %xmm2 ## [1:1.00] +; KNL-NEXT: vpinsrb $6, %r12d, %xmm2, %xmm2 ## [1:1.00] +; KNL-NEXT: vpinsrb $7, %r13d, %xmm2, %xmm2 ## [1:1.00] +; KNL-NEXT: vpinsrb $8, %ebx, %xmm2, %xmm2 ## [1:1.00] +; KNL-NEXT: vpinsrb $9, %ebp, %xmm2, %xmm2 ## [1:1.00] +; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 ## [1:1.00] +; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 ## [1:1.00] +; KNL-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2 ## [1:1.00] +; KNL-NEXT: vpinsrb $13, %esi, %xmm2, %xmm2 ## [1:1.00] +; KNL-NEXT: vpinsrb $14, %r9d, %xmm2, %xmm2 ## [1:1.00] ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ## [1:1.00] ; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 ; KNL-NEXT: vpslld $31, %zmm2, %zmm2 ; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 @@ -3122,26 +3240,26 @@ ; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: kshiftlw $1, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vmovd %r10d, %xmm2 +; KNL-NEXT: vmovd %r10d, %xmm2 ## [1:1.00] ; KNL-NEXT: kmovw %k0, %r10d ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL-NEXT: kshiftrw $15, %k2, %k0 -; KNL-NEXT: vpinsrb $1, %r8d, %xmm2, %xmm1 -; KNL-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $3, %r11d, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $4, %r14d, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $5, %r15d, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $6, %r12d, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $7, %r13d, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $10, %ebp, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $11, %ebx, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $13, %edx, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $14, %r10d, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $1, %r8d, %xmm2, %xmm1 ## [1:1.00] +; KNL-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1 ## [1:1.00] +; KNL-NEXT: vpinsrb $3, %r11d, %xmm1, %xmm1 ## [1:1.00] +; KNL-NEXT: vpinsrb $4, %r14d, %xmm1, %xmm1 ## [1:1.00] +; KNL-NEXT: vpinsrb $5, %r15d, %xmm1, %xmm1 ## [1:1.00] +; KNL-NEXT: vpinsrb $6, %r12d, %xmm1, %xmm1 ## [1:1.00] +; KNL-NEXT: vpinsrb $7, %r13d, %xmm1, %xmm1 ## [1:1.00] +; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 ## [1:1.00] +; KNL-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1 ## [1:1.00] +; KNL-NEXT: vpinsrb $10, %ebp, %xmm1, %xmm1 ## [1:1.00] +; KNL-NEXT: vpinsrb $11, %ebx, %xmm1, %xmm1 ## [1:1.00] +; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ## [1:1.00] +; KNL-NEXT: vpinsrb $13, %edx, %xmm1, %xmm1 ## [1:1.00] +; KNL-NEXT: vpinsrb $14, %r10d, %xmm1, %xmm1 ## [1:1.00] ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ## [1:1.00] ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 ; KNL-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 @@ -3190,26 +3308,26 @@ ; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: kshiftlw $1, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vmovd %r10d, %xmm1 +; KNL-NEXT: vmovd %r10d, %xmm1 ## [1:1.00] ; KNL-NEXT: kmovw %k0, %r10d ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm0 -; KNL-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; KNL-NEXT: 
vpinsrb $3, %r11d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $14, %r10d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $14, %r10d, %xmm0, %xmm0 ## [1:1.00] ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ## [1:1.00] ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 @@ -3258,44 +3376,44 @@ ; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vmovd %r9d, %xmm0 +; KNL-NEXT: vmovd %r9d, %xmm0 ## [1:1.00] ; KNL-NEXT: kmovw %k1, %r9d -; KNL-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## [1:1.00] ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ## [1:1.00] +; KNL-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0 ## [1:1.00] ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ## [1:1.00] ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, (%rdi) -; KNL-NEXT: popq %rbx -; KNL-NEXT: popq 
%r12 -; KNL-NEXT: popq %r13 -; KNL-NEXT: popq %r14 -; KNL-NEXT: popq %r15 -; KNL-NEXT: popq %rbp -; KNL-NEXT: retq +; KNL-NEXT: popq %rbx ## [4:0.50] +; KNL-NEXT: popq %r12 ## [4:0.50] +; KNL-NEXT: popq %r13 ## [4:0.50] +; KNL-NEXT: popq %r14 ## [4:0.50] +; KNL-NEXT: popq %r15 ## [4:0.50] +; KNL-NEXT: popq %rbp ## [4:0.50] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: store_64i1: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %zmm0, %zmm0 ; SKX-NEXT: vpmovb2m %zmm0, %k0 ; SKX-NEXT: kmovq %k0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper ## [1:0.00] +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: store_64i1: ; AVX512BW: ## BB#0: @@ -3632,21 +3750,21 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) { ; KNL-LABEL: test_bitcast_v8i1_zext: ; KNL: ## BB#0: -; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## [?:0.000000e+00] +; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## [?:0.000000e+00] ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: movzbl %al, %eax -; KNL-NEXT: addl %eax, %eax -; KNL-NEXT: retq +; KNL-NEXT: movzbl %al, %eax ## [1:0.25] +; KNL-NEXT: addl %eax, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test_bitcast_v8i1_zext: ; SKX: ## BB#0: -; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## [?:0.000000e+00] +; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## [?:0.000000e+00] ; SKX-NEXT: kmovb %k0, %eax -; SKX-NEXT: addl %eax, %eax -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: addl %eax, %eax ## [1:0.25] +; SKX-NEXT: vzeroupper ## [1:0.00] +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test_bitcast_v8i1_zext: ; AVX512BW: ## BB#0: @@ -3677,20 +3795,20 @@ define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) { ; KNL-LABEL: test_bitcast_v16i1_zext: ; KNL: ## BB#0: -; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## [?:0.000000e+00] +; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## [?:0.000000e+00] ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: addl %eax, %eax -; KNL-NEXT: retq +; KNL-NEXT: addl %eax, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test_bitcast_v16i1_zext: ; SKX: ## BB#0: -; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## [?:0.000000e+00] +; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## [?:0.000000e+00] ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: addl %eax, %eax -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: addl %eax, %eax ## [1:0.25] +; SKX-NEXT: vzeroupper ## [1:0.00] +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test_bitcast_v16i1_zext: ; AVX512BW: ## BB#0: @@ -3724,7 +3842,7 @@ ; KNL-NEXT: kxorw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AX %AX %EAX -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test_v16i1_add: ; SKX: ## BB#0: @@ -3733,7 +3851,7 @@ ; SKX-NEXT: kxorw %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AX %AX %EAX -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test_v16i1_add: ; AVX512BW: ## BB#0: @@ -3767,7 +3885,7 @@ ; KNL-NEXT: kxorw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AX %AX %EAX -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test_v16i1_sub: ; SKX: ## BB#0: @@ -3776,7 +3894,7 @@ ; SKX-NEXT: kxorw %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AX %AX %EAX -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; 
AVX512BW-LABEL: test_v16i1_sub: ; AVX512BW: ## BB#0: @@ -3810,7 +3928,7 @@ ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AX %AX %EAX -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test_v16i1_mul: ; SKX: ## BB#0: @@ -3819,7 +3937,7 @@ ; SKX-NEXT: kandw %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AX %AX %EAX -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test_v16i1_mul: ; AVX512BW: ## BB#0: @@ -3853,7 +3971,7 @@ ; KNL-NEXT: kxorw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AL %AL %EAX -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test_v8i1_add: ; SKX: ## BB#0: @@ -3862,7 +3980,7 @@ ; SKX-NEXT: kxorb %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AL %AL %EAX -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test_v8i1_add: ; AVX512BW: ## BB#0: @@ -3896,7 +4014,7 @@ ; KNL-NEXT: kxorw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AL %AL %EAX -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test_v8i1_sub: ; SKX: ## BB#0: @@ -3905,7 +4023,7 @@ ; SKX-NEXT: kxorb %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AL %AL %EAX -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test_v8i1_sub: ; AVX512BW: ## BB#0: @@ -3939,7 +4057,7 @@ ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AL %AL %EAX -; KNL-NEXT: retq +; KNL-NEXT: retq ## [1:1.00] ; ; SKX-LABEL: test_v8i1_mul: ; SKX: ## BB#0: @@ -3948,7 +4066,7 @@ ; SKX-NEXT: kandb %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AL %AL %EAX -; SKX-NEXT: retq +; SKX-NEXT: retq ## [1:1.00] ; ; AVX512BW-LABEL: test_v8i1_mul: ; AVX512BW: ## BB#0: Index: test/CodeGen/X86/avx512-mask-zext-bugfix.ll =================================================================== --- test/CodeGen/X86/avx512-mask-zext-bugfix.ll +++ test/CodeGen/X86/avx512-mask-zext-bugfix.ll @@ -17,15 +17,70 @@ define void @test_xmm(i32 %shift, i32 %mulp, <2 x i64> %a,i8* %arraydecay,i8* %fname){ ; CHECK-LABEL: test_xmm: ; CHECK: ## BB#0: -; CHECK: callq _calc_expected_mask_val -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: movw %dx, %r9w -; CHECK-NEXT: movzwl %r9w, %esi -; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload +; CHECK-NEXT: subq $72, %rsp ## [1:0.25] +; CHECK-NEXT: Lcfi0: +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: movl $4, %eax ## [1:0.25] +; CHECK-NEXT: vpmovw2m %xmm0, %k0 +; CHECK-NEXT: movl $2, %esi ## [1:0.25] +; CHECK-NEXT: movl $8, %edi ## [1:0.25] +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%rsp) ## 4-byte Spill [1:1.00] +; CHECK-NEXT: ## [1:1.00] +; CHECK-NEXT: movq %rdx, %rdi ## [1:0.25] +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r8d ## 4-byte Reload [4:0.50] +; CHECK-NEXT: ## [4:0.50] +; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ## 8-byte Spill [1:1.00] +; CHECK-NEXT: ## [1:1.00] +; CHECK-NEXT: movl %r8d, %edx ## [1:0.25] +; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill [1:1.00] +; CHECK-NEXT: ## [1:1.00] +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:0.00] +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill [1:1.00] +; CHECK-NEXT: ## [1:1.00] +; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill [1:0.00] +; CHECK-NEXT: callq _calc_expected_mask_val +; CHECK-NEXT: movl %eax, %edx ## [1:0.25] +; CHECK-NEXT: movw %dx, %r9w ## [1:0.25] +; CHECK-NEXT: movzwl %r9w, %esi ## [1:0.25] +; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload [4:0.00] ; CHECK-NEXT: 
kmovb %k0, %edi -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdx ## 8-byte Reload -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdx ## 8-byte Reload [4:0.50] +; CHECK-NEXT: ## [4:0.50] +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload [4:0.50] +; CHECK-NEXT: ## [4:0.50] ; CHECK-NEXT: callq _check_mask16 +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:0.50] +; CHECK-NEXT: ## [4:0.50] +; CHECK-NEXT: vpmovd2m %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %esi +; CHECK-NEXT: movb %sil, %r10b ## [1:0.25] +; CHECK-NEXT: movzbl %r10b, %esi ## [1:0.25] +; CHECK-NEXT: movw %si, %r9w ## [1:0.25] +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi ## 8-byte Reload [4:0.50] +; CHECK-NEXT: ## [4:0.50] +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %esi ## 4-byte Reload [4:0.50] +; CHECK-NEXT: ## [4:0.50] +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx ## 4-byte Reload [4:0.50] +; CHECK-NEXT: ## [4:0.50] +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill [1:1.00] +; CHECK-NEXT: ## [1:1.00] +; CHECK-NEXT: movw %r9w, {{[0-9]+}}(%rsp) ## 2-byte Spill [1:1.00] +; CHECK-NEXT: ## [1:1.00] +; CHECK-NEXT: callq _calc_expected_mask_val +; CHECK-NEXT: movw %ax, %r9w ## [1:0.25] +; CHECK-NEXT: movw {{[0-9]+}}(%rsp), %r11w ## 2-byte Reload [5:0.50] +; CHECK-NEXT: ## [5:0.50] +; CHECK-NEXT: movzwl %r11w, %edi ## [1:0.25] +; CHECK-NEXT: movzwl %r9w, %esi ## [1:0.25] +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdx ## 8-byte Reload [4:0.50] +; CHECK-NEXT: ## [4:0.50] +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload [4:0.50] +; CHECK-NEXT: ## [4:0.50] +; CHECK-NEXT: callq _check_mask16 +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill [1:1.00] +; CHECK-NEXT: ## [1:1.00] +; CHECK-NEXT: addq $72, %rsp ## [1:0.25] +; CHECK-NEXT: retq ## [1:1.00] %d2 = bitcast <2 x i64> %a to <8 x i16> %m2 = call i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16> %d2) %conv7 = zext i8 %m2 to i16 Index: test/CodeGen/X86/avx512-mov.ll =================================================================== --- test/CodeGen/X86/avx512-mov.ll +++ test/CodeGen/X86/avx512-mov.ll @@ -4,8 +4,8 @@ define i32 @test1(float %x) { ; CHECK-LABEL: test1: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovd %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7e,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovd %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7e,0xc0][1:0.25] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = bitcast float %x to i32 ret i32 %res } @@ -13,8 +13,8 @@ define <4 x i32> @test2(i32 %x) { ; CHECK-LABEL: test2: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovd %edi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovd %edi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = insertelement <4 x i32>undef, i32 %x, i32 0 ret <4 x i32>%res } @@ -22,8 +22,8 @@ define <2 x i64> @test3(i64 %x) { ; CHECK-LABEL: test3: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovq %rdi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x6e,0xc7] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovq %rdi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x6e,0xc7][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = insertelement <2 x i64>undef, i64 %x, i32 0 ret <2 x i64>%res } @@ -31,9 +31,8 @@ define <4 x i32> @test4(i32* %x) { ; CHECK-LABEL: test4: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovss (%rdi), %xmm0 ## 
EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] -; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]xmm0 = mem[0],zero,zero,zero[4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %y = load i32, i32* %x %res = insertelement <4 x i32>undef, i32 %y, i32 0 ret <4 x i32>%res @@ -42,8 +41,8 @@ define void @test5(float %x, float* %y) { ; CHECK-LABEL: test5: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovss %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovss %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] store float %x, float* %y, align 4 ret void } @@ -51,8 +50,8 @@ define void @test6(double %x, double* %y) { ; CHECK-LABEL: test6: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovsd %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovsd %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] store double %x, double* %y, align 8 ret void } @@ -60,9 +59,8 @@ define float @test7(i32* %x) { ; CHECK-LABEL: test7: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] -; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]xmm0 = mem[0],zero,zero,zero[4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %y = load i32, i32* %x %res = bitcast i32 %y to float ret float %res @@ -71,8 +69,8 @@ define i32 @test8(<4 x i32> %x) { ; CHECK-LABEL: test8: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovd %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7e,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovd %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7e,0xc0][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = extractelement <4 x i32> %x, i32 0 ret i32 %res } @@ -80,8 +78,8 @@ define i64 @test9(<2 x i64> %x) { ; CHECK-LABEL: test9: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovq %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x7e,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovq %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x7e,0xc0][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = extractelement <2 x i64> %x, i32 0 ret i64 %res } @@ -89,9 +87,8 @@ define <4 x i32> @test10(i32* %x) { ; CHECK-LABEL: test10: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] -; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]xmm0 = mem[0],zero,zero,zero[4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %y = load i32, i32* %x, align 4 %res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0 ret <4 x i32>%res @@ -100,9 +97,8 @@ define <4 x float> @test11(float* %x) { ; CHECK-LABEL: test11: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] -; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovss (%rdi), %xmm0 
## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]xmm0 = mem[0],zero,zero,zero[4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %y = load float, float* %x, align 4 %res = insertelement <4 x float>zeroinitializer, float %y, i32 0 ret <4 x float>%res @@ -111,9 +107,8 @@ define <2 x double> @test12(double* %x) { ; CHECK-LABEL: test12: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovsd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] -; CHECK-NEXT: ## xmm0 = mem[0],zero -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovsd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07]xmm0 = mem[0],zero[4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %y = load double, double* %x, align 8 %res = insertelement <2 x double>zeroinitializer, double %y, i32 0 ret <2 x double>%res @@ -122,8 +117,8 @@ define <2 x i64> @test13(i64 %x) { ; CHECK-LABEL: test13: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovq %rdi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x6e,0xc7] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovq %rdi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x6e,0xc7][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = insertelement <2 x i64>zeroinitializer, i64 %x, i32 0 ret <2 x i64>%res } @@ -131,8 +126,8 @@ define <4 x i32> @test14(i32 %x) { ; CHECK-LABEL: test14: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovd %edi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovd %edi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = insertelement <4 x i32>zeroinitializer, i32 %x, i32 0 ret <4 x i32>%res } @@ -140,9 +135,8 @@ define <4 x i32> @test15(i32* %x) { ; CHECK-LABEL: test15: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] -; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]xmm0 = mem[0],zero,zero,zero[4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %y = load i32, i32* %x, align 4 %res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0 ret <4 x i32>%res @@ -151,8 +145,8 @@ define <16 x i32> @test16(i8 * %addr) { ; CHECK-LABEL: test16: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %vaddr = bitcast i8* %addr to <16 x i32>* %res = load <16 x i32>, <16 x i32>* %vaddr, align 1 ret <16 x i32>%res @@ -161,8 +155,8 @@ define <16 x i32> @test17(i8 * %addr) { ; CHECK-LABEL: test17: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %vaddr = bitcast i8* %addr to <16 x i32>* %res = load <16 x i32>, <16 x i32>* %vaddr, align 64 ret <16 x i32>%res @@ -172,7 +166,7 @@ ; CHECK-LABEL: test18: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %vaddr = bitcast i8* %addr to <8 x i64>* store 
<8 x i64>%data, <8 x i64>* %vaddr, align 64 ret void @@ -182,7 +176,7 @@ ; CHECK-LABEL: test19: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %vaddr = bitcast i8* %addr to <16 x i32>* store <16 x i32>%data, <16 x i32>* %vaddr, align 1 ret void @@ -192,7 +186,7 @@ ; CHECK-LABEL: test20: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %vaddr = bitcast i8* %addr to <16 x i32>* store <16 x i32>%data, <16 x i32>* %vaddr, align 64 ret void @@ -201,8 +195,8 @@ define <8 x i64> @test21(i8 * %addr) { ; CHECK-LABEL: test21: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %vaddr = bitcast i8* %addr to <8 x i64>* %res = load <8 x i64>, <8 x i64>* %vaddr, align 64 ret <8 x i64>%res @@ -212,7 +206,7 @@ ; CHECK-LABEL: test22: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %vaddr = bitcast i8* %addr to <8 x i64>* store <8 x i64>%data, <8 x i64>* %vaddr, align 1 ret void @@ -221,8 +215,8 @@ define <8 x i64> @test23(i8 * %addr) { ; CHECK-LABEL: test23: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %vaddr = bitcast i8* %addr to <8 x i64>* %res = load <8 x i64>, <8 x i64>* %vaddr, align 1 ret <8 x i64>%res @@ -232,7 +226,7 @@ ; CHECK-LABEL: test24: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %vaddr = bitcast i8* %addr to <8 x double>* store <8 x double>%data, <8 x double>* %vaddr, align 64 ret void @@ -241,8 +235,8 @@ define <8 x double> @test25(i8 * %addr) { ; CHECK-LABEL: test25: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %vaddr = bitcast i8* %addr to <8 x double>* %res = load <8 x double>, <8 x double>* %vaddr, align 64 ret <8 x double>%res @@ -252,7 +246,7 @@ ; CHECK-LABEL: test26: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %vaddr = bitcast i8* %addr to <16 x float>* store <16 x float>%data, <16 x float>* %vaddr, align 64 ret void @@ -261,8 +255,8 @@ define <16 x float> @test27(i8 * %addr) { ; CHECK-LABEL: test27: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %vaddr = bitcast i8* %addr to <16 x 
float>* %res = load <16 x float>, <16 x float>* %vaddr, align 64 ret <16 x float>%res @@ -272,7 +266,7 @@ ; CHECK-LABEL: test28: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %vaddr = bitcast i8* %addr to <8 x double>* store <8 x double>%data, <8 x double>* %vaddr, align 1 ret void @@ -281,8 +275,8 @@ define <8 x double> @test29(i8 * %addr) { ; CHECK-LABEL: test29: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %vaddr = bitcast i8* %addr to <8 x double>* %res = load <8 x double>, <8 x double>* %vaddr, align 1 ret <8 x double>%res @@ -292,7 +286,7 @@ ; CHECK-LABEL: test30: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %vaddr = bitcast i8* %addr to <16 x float>* store <16 x float>%data, <16 x float>* %vaddr, align 1 ret void @@ -301,8 +295,8 @@ define <16 x float> @test31(i8 * %addr) { ; CHECK-LABEL: test31: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %vaddr = bitcast i8* %addr to <16 x float>* %res = load <16 x float>, <16 x float>* %vaddr, align 1 ret <16 x float>%res @@ -311,10 +305,10 @@ define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { ; CHECK-LABEL: test32: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04] -; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2][?:0.000000e+00] +; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04][?:0.000000e+00] +; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i32>* %r = load <16 x i32>, <16 x i32>* %vaddr, align 64 @@ -325,10 +319,10 @@ define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { ; CHECK-LABEL: test33: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] -; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04] -; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x6f,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2][?:0.000000e+00] +; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04][?:0.000000e+00] +; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x6f,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %vaddr = 
bitcast i8* %addr to <16 x i32>* %r = load <16 x i32>, <16 x i32>* %vaddr, align 1 @@ -339,10 +333,10 @@ define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) { ; CHECK-LABEL: test34: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04] -; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6f,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9][?:0.000000e+00] +; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04][?:0.000000e+00] +; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6f,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i32>* %r = load <16 x i32>, <16 x i32>* %vaddr, align 64 @@ -353,10 +347,10 @@ define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) { ; CHECK-LABEL: test35: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04] -; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xc9,0x6f,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9][?:0.000000e+00] +; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04][?:0.000000e+00] +; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xc9,0x6f,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i32>* %r = load <16 x i32>, <16 x i32>* %vaddr, align 1 @@ -367,10 +361,10 @@ define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { ; CHECK-LABEL: test36: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] -; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04] -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6f,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2][?:0.000000e+00] +; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04][?:0.000000e+00] +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6f,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i64>* %r = load <8 x i64>, <8 x i64>* %vaddr, align 64 @@ -381,10 +375,10 @@ define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { ; CHECK-LABEL: test37: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] -; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04] -; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x6f,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2][?:0.000000e+00] +; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: 
[0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04][?:0.000000e+00] +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x6f,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i64>* %r = load <8 x i64>, <8 x i64>* %vaddr, align 1 @@ -395,10 +389,10 @@ define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) { ; CHECK-LABEL: test38: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9] -; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x04] -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6f,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9][?:0.000000e+00] +; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x04][?:0.000000e+00] +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6f,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i64>* %r = load <8 x i64>, <8 x i64>* %vaddr, align 64 @@ -409,10 +403,10 @@ define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) { ; CHECK-LABEL: test39: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9] -; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x04] -; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xc9,0x6f,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9][?:0.000000e+00] +; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x04][?:0.000000e+00] +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xc9,0x6f,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i64>* %r = load <8 x i64>, <8 x i64>* %vaddr, align 1 @@ -423,11 +417,11 @@ define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) { ; CHECK-LABEL: test40: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2][?:0.000000e+00] ; CHECK-NEXT: vcmpordps %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0x74,0x48,0xc2,0xca,0x07] ; CHECK-NEXT: vcmpneqps %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x49,0xc2,0xca,0x04] -; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x28,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x28,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* %r = load <16 x float>, <16 x float>* %vaddr, align 64 @@ -438,11 +432,11 @@ define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) { ; CHECK-LABEL: test41: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2][?:0.000000e+00] ; CHECK-NEXT: vcmpordps %zmm2, %zmm1, %k1 ## encoding: 
[0x62,0xf1,0x74,0x48,0xc2,0xca,0x07] ; CHECK-NEXT: vcmpneqps %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x49,0xc2,0xca,0x04] -; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x10,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x10,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* %r = load <16 x float>, <16 x float>* %vaddr, align 1 @@ -453,11 +447,11 @@ define <16 x float> @test42(i8 * %addr, <16 x float> %mask1) { ; CHECK-LABEL: test42: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9] +; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9][?:0.000000e+00] ; CHECK-NEXT: vcmpordps %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x48,0xc2,0xc9,0x07] ; CHECK-NEXT: vcmpneqps %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0xc2,0xc9,0x04] -; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* %r = load <16 x float>, <16 x float>* %vaddr, align 64 @@ -468,11 +462,11 @@ define <16 x float> @test43(i8 * %addr, <16 x float> %mask1) { ; CHECK-LABEL: test43: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9] +; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9][?:0.000000e+00] ; CHECK-NEXT: vcmpordps %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x48,0xc2,0xc9,0x07] ; CHECK-NEXT: vcmpneqps %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0xc2,0xc9,0x04] -; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x10,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x10,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* %r = load <16 x float>, <16 x float>* %vaddr, align 1 @@ -483,11 +477,11 @@ define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) { ; CHECK-LABEL: test44: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2][?:0.000000e+00] ; CHECK-NEXT: vcmpordpd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0xf5,0x48,0xc2,0xca,0x07] ; CHECK-NEXT: vcmpneqpd %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0xc2,0xca,0x04] -; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x28,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x28,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %mask = fcmp one <8 x double> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x double>* %r = load <8 x double>, <8 x double>* %vaddr, align 64 @@ -498,11 +492,11 @@ define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) { ; CHECK-LABEL: test45: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord 
%zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2][?:0.000000e+00] ; CHECK-NEXT: vcmpordpd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0xf5,0x48,0xc2,0xca,0x07] ; CHECK-NEXT: vcmpneqpd %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0xc2,0xca,0x04] -; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x10,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x10,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %mask = fcmp one <8 x double> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x double>* %r = load <8 x double>, <8 x double>* %vaddr, align 1 @@ -513,11 +507,11 @@ define <8 x double> @test46(i8 * %addr, <8 x double> %mask1) { ; CHECK-LABEL: test46: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9] +; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9][?:0.000000e+00] ; CHECK-NEXT: vcmpordpd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x48,0xc2,0xc9,0x07] ; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xc2,0xc9,0x04] -; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x28,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x28,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %mask = fcmp one <8 x double> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x double>* %r = load <8 x double>, <8 x double>* %vaddr, align 64 @@ -528,11 +522,11 @@ define <8 x double> @test47(i8 * %addr, <8 x double> %mask1) { ; CHECK-LABEL: test47: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9] +; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9][?:0.000000e+00] ; CHECK-NEXT: vcmpordpd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x48,0xc2,0xc9,0x07] ; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xc2,0xc9,0x04] -; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x10,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x10,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %mask = fcmp one <8 x double> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x double>* %r = load <8 x double>, <8 x double>* %vaddr, align 1 Index: test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -6,13 +6,13 @@ define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) { ; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0] +; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0][3:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8] ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] -; 
CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9] -; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9][1:0.50] +; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1) %res1 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) %res2 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> zeroinitializer, i32 %mask) @@ -26,13 +26,13 @@ define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) { ; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0] +; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0][1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8] ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] -; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9] -; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9][1:0.50] +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1) %res1 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) %res2 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> zeroinitializer, i16 %mask) @@ -46,13 +46,13 @@ define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) { ; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0] +; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0][3:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8] ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0] -; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9] -; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9][1:0.50] +; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1) %res1 = call <16 x i16> 
@llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) %res2 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> zeroinitializer, i16 %mask) @@ -66,13 +66,13 @@ define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0] +; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0][1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8] ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0] -; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9][1:0.50] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) %res2 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> zeroinitializer, i8 %mask) @@ -92,7 +92,7 @@ ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0] ; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9] ; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1) %res1 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) %res2 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> zeroinitializer, i64 %mask) @@ -112,7 +112,7 @@ ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0] ; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9] ; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1) %res1 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) %res2 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> zeroinitializer, i32 %mask) @@ -128,8 +128,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca] ; CHECK-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x7f,0x07] -; CHECK-NEXT: vmovdqu %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqu %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.storeu.b.128(i8* %ptr1, <16 x i8> %x1, i16 %x2) 
call void @llvm.x86.avx512.mask.storeu.b.128(i8* %ptr2, <16 x i8> %x1, i16 -1) ret void @@ -142,8 +142,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca] ; CHECK-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x7f,0x07] -; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr1, <32 x i8> %x1, i32 %x2) call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr2, <32 x i8> %x1, i32 -1) ret void @@ -156,8 +156,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca] ; CHECK-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x7f,0x07] -; CHECK-NEXT: vmovdqu %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqu %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.storeu.w.128(i8* %ptr1, <8 x i16> %x1, i8 %x2) call void @llvm.x86.avx512.mask.storeu.w.128(i8* %ptr2, <8 x i16> %x1, i8 -1) ret void @@ -170,8 +170,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca] ; CHECK-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x7f,0x07] -; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr1, <16 x i16> %x1, i16 %x2) call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr2, <16 x i16> %x1, i16 -1) ret void @@ -182,12 +182,12 @@ define <8 x i16>@test_int_x86_avx512_mask_loadu_w_128(i8* %ptr, i8* %ptr2, <8 x i16> %x1, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07][4:0.50] ; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca] -; CHECK-NEXT: vmovdqu16 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x06] -; CHECK-NEXT: vmovdqu16 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x6f,0x0f] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqu16 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x06][4:0.50] +; CHECK-NEXT: vmovdqu16 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x6f,0x0f][4:0.50] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr, <8 x i16> %x1, i8 -1) %res = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr2, <8 x i16> %res0, i8 %mask) %res1 = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr, <8 x i16> zeroinitializer, i8 %mask) @@ -200,12 +200,12 @@ define <16 x i16>@test_int_x86_avx512_mask_loadu_w_256(i8* %ptr, i8* 
%ptr2, <16 x i16> %x1, i16 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07][4:0.50] ; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca] -; CHECK-NEXT: vmovdqu16 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x6f,0x06] -; CHECK-NEXT: vmovdqu16 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x0f] -; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqu16 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x6f,0x06][4:0.50] +; CHECK-NEXT: vmovdqu16 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x0f][4:0.50] +; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr, <16 x i16> %x1, i16 -1) %res = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr2, <16 x i16> %res0, i16 %mask) %res1 = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr, <16 x i16> zeroinitializer, i16 %mask) @@ -218,12 +218,12 @@ define <16 x i8>@test_int_x86_avx512_mask_loadu_b_128(i8* %ptr, i8* %ptr2, <16 x i8> %x1, i16 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07][4:0.50] ; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca] -; CHECK-NEXT: vmovdqu8 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x6f,0x06] -; CHECK-NEXT: vmovdqu8 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x0f] -; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqu8 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x6f,0x06][4:0.50] +; CHECK-NEXT: vmovdqu8 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x0f][4:0.50] +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr, <16 x i8> %x1, i16 -1) %res = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr2, <16 x i8> %res0, i16 %mask) %res1 = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr, <16 x i8> zeroinitializer, i16 %mask) @@ -236,12 +236,12 @@ define <32 x i8>@test_int_x86_avx512_mask_loadu_b_256(i8* %ptr, i8* %ptr2, <32 x i8> %x1, i32 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07][4:0.50] ; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca] -; CHECK-NEXT: vmovdqu8 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x6f,0x06] -; CHECK-NEXT: vmovdqu8 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x0f] -; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1] -; CHECK-NEXT: retq ## 
encoding: [0xc3] +; CHECK-NEXT: vmovdqu8 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x6f,0x06][4:0.50] +; CHECK-NEXT: vmovdqu8 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x0f][4:0.50] +; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr, <32 x i8> %x1, i32 -1) %res = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr2, <32 x i8> %res0, i32 %mask) %res1 = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr, <32 x i8> zeroinitializer, i32 %mask) @@ -254,16 +254,13 @@ define <16 x i8>@test_int_x86_avx512_mask_palignr_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x3, i16 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_palignr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x02] -; CHECK-NEXT: ## xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] +; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x02]xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1][1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x0f,0xd1,0x02] -; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] -; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x0f,0xc1,0x02] -; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] -; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] -; CHECK-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x0f,0xd1,0x02]xmm2 {%k1} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] +; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x0f,0xc1,0x02]xmm0 {%k1} {z} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] +; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0][1:0.50] +; CHECK-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 %x4) %res1 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> zeroinitializer, i16 %x4) %res2 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 -1) @@ -277,16 +274,13 @@ define <32 x i8>@test_int_x86_avx512_mask_palignr_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x3, i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_palignr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x0f,0xd9,0x02] -; CHECK-NEXT: ## ymm3 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] +; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x0f,0xd9,0x02]ymm3 = 
ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17][1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x0f,0xd1,0x02] -; CHECK-NEXT: ## ymm2 {%k1} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] -; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x0f,0xc1,0x02] -; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] -; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] -; CHECK-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x0f,0xd1,0x02]ymm2 {%k1} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] +; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x0f,0xc1,0x02]ymm0 {%k1} {z} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] +; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0][1:0.50] +; CHECK-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 %x4) %res1 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> zeroinitializer, i32 %x4) %res2 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 -1) @@ -300,16 +294,13 @@ define <8 x i16>@test_int_x86_avx512_mask_pshufh_w_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pshufh_w_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x70,0xd0,0x03] -; CHECK-NEXT: ## xmm2 = xmm0[0,1,2,3,7,4,4,4] +; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x70,0xd0,0x03]xmm2 = xmm0[0,1,2,3,7,4,4,4][1:1.00] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x70,0xc8,0x03] -; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,1,2,3,7,4,4,4] -; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x70,0xc0,0x03] -; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,1,2,3,7,4,4,4] -; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x70,0xc8,0x03]xmm1 {%k1} = xmm0[0,1,2,3,7,4,4,4] +; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x70,0xc0,0x03]xmm0 {%k1} {z} = xmm0[0,1,2,3,7,4,4,4] +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX 
Compression encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3) %res2 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1) @@ -323,16 +314,13 @@ define <16 x i16>@test_int_x86_avx512_mask_pshufh_w_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pshufh_w_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x70,0xd0,0x03] -; CHECK-NEXT: ## ymm2 = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] +; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x70,0xd0,0x03]ymm2 = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12][1:1.00] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x70,0xc8,0x03] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] -; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x70,0xc0,0x03] -; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] -; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x70,0xc8,0x03]ymm1 {%k1} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] +; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x70,0xc0,0x03]ymm0 {%k1} {z} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0][1:0.50] +; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) %res2 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1) @@ -346,16 +334,13 @@ define <8 x i16>@test_int_x86_avx512_mask_pshufl_w_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pshufl_w_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x70,0xd0,0x03] -; CHECK-NEXT: ## xmm2 = xmm0[3,0,0,0,4,5,6,7] +; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x70,0xd0,0x03]xmm2 = xmm0[3,0,0,0,4,5,6,7][1:1.00] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x70,0xc8,0x03] -; CHECK-NEXT: ## xmm1 {%k1} = xmm0[3,0,0,0,4,5,6,7] -; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x70,0xc0,0x03] -; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[3,0,0,0,4,5,6,7] -; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX 
Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x70,0xc8,0x03]xmm1 {%k1} = xmm0[3,0,0,0,4,5,6,7] +; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x70,0xc0,0x03]xmm0 {%k1} {z} = xmm0[3,0,0,0,4,5,6,7] +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3) %res2 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1) @@ -369,16 +354,13 @@ define <16 x i16>@test_int_x86_avx512_mask_pshufl_w_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pshufl_w_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0x70,0xd0,0x03] -; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] +; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0x70,0xd0,0x03]ymm2 = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15][1:1.00] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x70,0xc8,0x03] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] -; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x70,0xc0,0x03] -; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] -; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x70,0xc8,0x03]ymm1 {%k1} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] +; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x70,0xc0,0x03]ymm0 {%k1} {z} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0][1:0.50] +; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) %res2 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1) @@ -390,9 +372,9 @@ define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) { ; CHECK-LABEL: test_pcmpeq_b_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] +; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1][?:0.000000e+00] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i32 
@llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1) ret i32 %res } @@ -401,9 +383,9 @@ ; CHECK-LABEL: test_mask_pcmpeq_b_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] +; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1][?:0.000000e+00] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask) ret i32 %res } @@ -413,10 +395,10 @@ define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: test_pcmpeq_w_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1][?:0.000000e+00] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: ## kill: %AX %AX %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1) ret i16 %res } @@ -425,10 +407,10 @@ ; CHECK-LABEL: test_mask_pcmpeq_w_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] +; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1][?:0.000000e+00] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: ## kill: %AX %AX %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask) ret i16 %res } @@ -438,9 +420,9 @@ define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) { ; CHECK-LABEL: test_pcmpgt_b_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1] +; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1][?:0.000000e+00] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1) ret i32 %res } @@ -449,9 +431,9 @@ ; CHECK-LABEL: test_mask_pcmpgt_b_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1] +; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1][?:0.000000e+00] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask) ret i32 %res } @@ -461,10 +443,10 @@ define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: test_pcmpgt_w_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1] +; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1][?:0.000000e+00] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: ## kill: %AX %AX %EAX 
-; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1) ret i16 %res } @@ -473,10 +455,10 @@ ; CHECK-LABEL: test_mask_pcmpgt_w_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1] +; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1][?:0.000000e+00] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: ## kill: %AX %AX %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask) ret i16 %res } @@ -488,13 +470,11 @@ define <16 x i8>@test_int_x86_avx512_mask_punpckhb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x68,0xd9] -; CHECK-NEXT: ## xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; CHECK-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x68,0xd9]xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15][1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x68,0xd1] -; CHECK-NEXT: ## xmm2 {%k1} = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x68,0xd1]xmm2 {%k1} = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) %res2 = add <16 x i8> %res, %res1 @@ -506,13 +486,11 @@ define <16 x i8>@test_int_x86_avx512_mask_punpcklb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x60,0xd9] -; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; CHECK-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x60,0xd9]xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm2 {%k1} ## 
encoding: [0x62,0xf1,0x7d,0x09,0x60,0xd1] -; CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x60,0xd1]xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) %res2 = add <16 x i8> %res, %res1 @@ -524,13 +502,11 @@ define <32 x i8>@test_int_x86_avx512_mask_punpckhb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x68,0xd9] -; CHECK-NEXT: ## ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; CHECK-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x68,0xd9]ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31][1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x68,0xd1] -; CHECK-NEXT: ## ymm2 {%k1} = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x68,0xd1]ymm2 {%k1} = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) %res2 = add <32 x i8> %res, %res1 @@ -542,13 +518,11 @@ define <32 x i8>@test_int_x86_avx512_mask_punpcklb_w_256(<32 x i8> %x0, <32 x i8> 
%x1, <32 x i8> %x2, i32 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x60,0xd9] -; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; CHECK-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x60,0xd9]ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23][1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x60,0xd1] -; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x60,0xd1]ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) %res2 = add <32 x i8> %res, %res1 @@ -560,13 +534,11 @@ define <8 x i16>@test_int_x86_avx512_mask_punpcklw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x61,0xd9] -; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x61,0xd9]xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x61,0xd1] -; CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x61,0xd1]xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xe9,0xfd,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) %res2 = add <8 x i16> %res, %res1 @@ -578,13 +550,11 @@ define <8 x i16>@test_int_x86_avx512_mask_punpckhw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x69,0xd9] -; CHECK-NEXT: ## xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; CHECK-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x69,0xd9]xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x69,0xd1] -; CHECK-NEXT: ## xmm2 {%k1} = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x69,0xd1]xmm2 {%k1} = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) %res2 = add <8 x i16> %res, %res1 @@ -596,13 +566,11 @@ define <16 x i16>@test_int_x86_avx512_mask_punpcklw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x61,0xd9] -; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; CHECK-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x61,0xd9]ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11][1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x61,0xd1] -; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x61,0xd1]ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> 
@llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) %res2 = add <16 x i16> %res, %res1 @@ -614,13 +582,11 @@ define <16 x i16>@test_int_x86_avx512_mask_punpckhw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x69,0xd9] -; CHECK-NEXT: ## ymm3 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; CHECK-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x69,0xd9]ymm3 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15][1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x69,0xd1] -; CHECK-NEXT: ## ymm2 {%k1} = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x69,0xd1]ymm2 {%k1} = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) %res2 = add <16 x i16> %res, %res1 @@ -630,8 +596,8 @@ define <8 x i16> @test_mask_add_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_mask_add_epi16_rr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) ret <8 x i16> %res } @@ -641,8 +607,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) ret <8 x i16> %res } @@ -652,7 +618,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0xc1] -; CHECK-NEXT: retq ## 
encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) ret <8 x i16> %res } @@ -660,8 +626,8 @@ define <8 x i16> @test_mask_add_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { ; CHECK-LABEL: test_mask_add_epi16_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpaddw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i16>, <8 x i16>* %ptr_b %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) ret <8 x i16> %res @@ -672,8 +638,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpaddw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i16>, <8 x i16>* %ptr_b %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) ret <8 x i16> %res @@ -684,7 +650,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpaddw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i16>, <8 x i16>* %ptr_b %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) ret <8 x i16> %res @@ -695,8 +661,8 @@ define <16 x i16> @test_mask_add_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: test_mask_add_epi16_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) ret <16 x i16> %res } @@ -706,8 +672,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) ret <16 x i16> %res } @@ -717,7 +683,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> 
%b, <16 x i16> zeroinitializer, i16 %mask) ret <16 x i16> %res } @@ -725,8 +691,8 @@ define <16 x i16> @test_mask_add_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { ; CHECK-LABEL: test_mask_add_epi16_rm_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpaddw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <16 x i16>, <16 x i16>* %ptr_b %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) ret <16 x i16> %res @@ -737,8 +703,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpaddw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <16 x i16>, <16 x i16>* %ptr_b %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) ret <16 x i16> %res @@ -749,7 +715,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpaddw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <16 x i16>, <16 x i16>* %ptr_b %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) ret <16 x i16> %res @@ -760,8 +726,8 @@ define <8 x i16> @test_mask_sub_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_mask_sub_epi16_rr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf9,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf9,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) ret <8 x i16> %res } @@ -771,8 +737,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsubw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) ret <8 x i16> %res } @@ -782,7 +748,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) ret <8 x i16> %res } @@ -790,8 +756,8 @@ define <8 x i16> @test_mask_sub_epi16_rm_128(<8 x 
i16> %a, <8 x i16>* %ptr_b) { ; CHECK-LABEL: test_mask_sub_epi16_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsubw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf9,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpsubw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf9,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i16>, <8 x i16>* %ptr_b %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) ret <8 x i16> %res @@ -802,8 +768,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsubw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i16>, <8 x i16>* %ptr_b %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) ret <8 x i16> %res @@ -814,7 +780,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsubw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i16>, <8 x i16>* %ptr_b %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) ret <8 x i16> %res @@ -825,8 +791,8 @@ define <16 x i16> @test_mask_sub_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: test_mask_sub_epi16_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf9,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf9,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) ret <16 x i16> %res } @@ -836,8 +802,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) ret <16 x i16> %res } @@ -847,7 +813,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) ret <16 x i16> %res } @@ -855,8 +821,8 @@ define <16 x i16> @test_mask_sub_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { ; CHECK-LABEL: test_mask_sub_epi16_rm_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsubw (%rdi), %ymm0, %ymm0 ## 
EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf9,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpsubw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf9,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <16 x i16>, <16 x i16>* %ptr_b %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) ret <16 x i16> %res @@ -867,8 +833,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsubw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <16 x i16>, <16 x i16>* %ptr_b %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) ret <16 x i16> %res @@ -879,7 +845,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsubw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <16 x i16>, <16 x i16>* %ptr_b %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) ret <16 x i16> %res @@ -891,7 +857,7 @@ ; CHECK-LABEL: test_mask_add_epi16_rr_512: ; CHECK: ## BB#0: ; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } @@ -902,7 +868,7 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0xd1] ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } @@ -912,7 +878,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } @@ -921,7 +887,7 @@ ; CHECK-LABEL: test_mask_add_epi16_rm_512: ; CHECK: ## BB#0: ; CHECK-NEXT: vpaddw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res @@ -933,7 +899,7 @@ ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpaddw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0x0f] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## 
encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -944,7 +910,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpaddw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res @@ -956,7 +922,7 @@ ; CHECK-LABEL: test_mask_sub_epi16_rr_512: ; CHECK: ## BB#0: ; CHECK-NEXT: vpsubw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } @@ -967,7 +933,7 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsubw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0xd1] ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } @@ -977,7 +943,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsubw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } @@ -986,7 +952,7 @@ ; CHECK-LABEL: test_mask_sub_epi16_rm_512: ; CHECK: ## BB#0: ; CHECK-NEXT: vpsubw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res @@ -998,7 +964,7 @@ ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsubw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0x0f] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -1009,7 +975,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsubw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res @@ 
-1021,7 +987,7 @@ ; CHECK-LABEL: test_mask_mullo_epi16_rr_512: ; CHECK: ## BB#0: ; CHECK-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } @@ -1032,7 +998,7 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmullw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0xd1] ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } @@ -1042,7 +1008,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmullw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } @@ -1051,7 +1017,7 @@ ; CHECK-LABEL: test_mask_mullo_epi16_rm_512: ; CHECK: ## BB#0: ; CHECK-NEXT: vpmullw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res @@ -1063,7 +1029,7 @@ ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpmullw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0x0f] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -1074,7 +1040,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpmullw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res @@ -1085,8 +1051,8 @@ define <8 x i16> @test_mask_mullo_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_mask_mullo_epi16_rr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd5,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd5,0xc1][5:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) ret <8 x i16> %res } @@ -1096,8 +1062,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmullw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: 
[0x62,0xf1,0x7d,0x09,0xd5,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) ret <8 x i16> %res } @@ -1107,7 +1073,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmullw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) ret <8 x i16> %res } @@ -1115,8 +1081,8 @@ define <8 x i16> @test_mask_mullo_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { ; CHECK-LABEL: test_mask_mullo_epi16_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmullw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd5,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmullw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd5,0x07][9:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i16>, <8 x i16>* %ptr_b %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1) ret <8 x i16> %res @@ -1127,8 +1093,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpmullw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i16>, <8 x i16>* %ptr_b %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) ret <8 x i16> %res @@ -1139,7 +1105,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpmullw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i16>, <8 x i16>* %ptr_b %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask) ret <8 x i16> %res @@ -1150,8 +1116,8 @@ define <16 x i16> @test_mask_mullo_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: test_mask_mullo_epi16_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd5,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd5,0xc1][5:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) ret <16 x i16> %res } @@ -1161,8 +1127,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) ret <16 x i16> %res } @@ -1172,7 +1138,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) ret <16 x i16> %res } @@ -1180,8 +1146,8 @@ define <16 x i16> @test_mask_mullo_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { ; CHECK-LABEL: test_mask_mullo_epi16_rm_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmullw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd5,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmullw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd5,0x07][9:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <16 x i16>, <16 x i16>* %ptr_b %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1) ret <16 x i16> %res @@ -1192,8 +1158,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpmullw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <16 x i16>, <16 x i16>* %ptr_b %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) ret <16 x i16> %res @@ -1204,7 +1170,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpmullw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <16 x i16>, <16 x i16>* %ptr_b %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask) ret <16 x i16> %res @@ -1220,8 +1186,8 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3c,0xd1] ; CHECK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3c,0xc1] -; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2 ,i16 %mask) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) %res2 = add <16 x i8> %res, %res1 @@ -1233,11 +1199,11 @@ define <32 x i8>@test_int_x86_avx512_mask_pmaxs_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x 
i8> %x2, i32 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_b_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xd9] +; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xd9][1:0.50] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3c,0xd1] -; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) %res2 = add <32 x i8> %res, %res1 @@ -1249,11 +1215,11 @@ define <8 x i16>@test_int_x86_avx512_mask_pmaxs_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_w_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0xd9] +; CHECK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0xd9][1:0.50] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xee,0xd1] -; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) %res2 = add <8 x i16> %res, %res1 @@ -1268,8 +1234,8 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xee,0xd1] ; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xee,0xc1] -; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) %res2 = add <16 x i16> %res, %res1 @@ -1284,8 +1250,8 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmaxub %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xde,0xd1] ; CHECK-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xde,0xc1] -; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0][1:0.50] +; CHECK-NEXT: retq ## 
encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) %res2 = add <16 x i8> %res, %res1 @@ -1297,11 +1263,11 @@ define <32 x i8>@test_int_x86_avx512_mask_pmaxu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_b_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xd9] +; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xd9][1:0.50] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xde,0xd1] -; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) %res2 = add <32 x i8> %res, %res1 @@ -1313,11 +1279,11 @@ define <8 x i16>@test_int_x86_avx512_mask_pmaxu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_w_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xd9] +; CHECK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xd9][1:0.50] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3e,0xd1] -; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) %res2 = add <8 x i16> %res, %res1 @@ -1332,8 +1298,8 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3e,0xd1] ; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x3e,0xc1] -; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) %res2 = add <16 x i16> %res, %res1 @@ -1348,8 +1314,8 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 {%k1} ## 
encoding: [0x62,0xf2,0x7d,0x09,0x38,0xd1] ; CHECK-NEXT: vpminsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x38,0xc1] -; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) %res2 = add <16 x i8> %res, %res1 @@ -1361,11 +1327,11 @@ define <32 x i8>@test_int_x86_avx512_mask_pmins_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmins_b_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xd9] +; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xd9][1:0.50] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x38,0xd1] -; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) %res2 = add <32 x i8> %res, %res1 @@ -1377,11 +1343,11 @@ define <8 x i16>@test_int_x86_avx512_mask_pmins_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmins_w_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpminsw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0xd9] +; CHECK-NEXT: vpminsw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0xd9][1:0.50] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xea,0xd1] -; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) %res2 = add <8 x i16> %res, %res1 @@ -1396,8 +1362,8 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xea,0xd1] ; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xea,0xc1] -; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = 
call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) %res2 = add <16 x i16> %res, %res1 @@ -1412,8 +1378,8 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpminub %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xda,0xd1] ; CHECK-NEXT: vpminub %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xda,0xc1] -; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) %res2 = add <16 x i8> %res, %res1 @@ -1425,11 +1391,11 @@ define <32 x i8>@test_int_x86_avx512_mask_pminu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pminu_b_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xd9] +; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xd9][1:0.50] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xda,0xd1] -; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) %res1 = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) %res2 = add <32 x i8> %res, %res1 @@ -1441,11 +1407,11 @@ define <8 x i16>@test_int_x86_avx512_mask_pminu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pminu_w_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpminuw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3a,0xd9] +; CHECK-NEXT: vpminuw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3a,0xd9][1:0.50] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpminuw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3a,0xd1] -; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) %res2 = add <8 x i16> %res, %res1 @@ -1460,8 +1426,8 @@ ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: 
[0x62,0xf2,0x7d,0x29,0x3a,0xd1] ; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x3a,0xc1] -; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) %res2 = add <16 x i16> %res, %res1 @@ -1473,13 +1439,13 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_w_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9] +; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9][2:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1] ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1] -; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb][1:0.50] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) %res2 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3) @@ -1493,13 +1459,13 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_w_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9] +; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9][2:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1] ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1] -; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb] -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb][1:0.50] +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> 
@llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1) %res2 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3) @@ -1513,13 +1479,13 @@ define <8 x i16>@test_int_x86_avx512_mask_psra_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe1,0xd9] +; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe1,0xd9][2:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe1,0xd1] ; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe1,0xc1] -; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] -; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0][1:0.50] +; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3) %res2 = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) @@ -1533,13 +1499,13 @@ define <16 x i16>@test_int_x86_avx512_mask_psra_w_256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe1,0xd9] +; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe1,0xd9][2:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe1,0xd1] ; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe1,0xc1] -; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] -; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0][1:0.50] +; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3) %res2 = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1) @@ -1553,13 +1519,13 @@ define <8 x i16>@test_int_x86_avx512_mask_psll_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_w_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf1,0xd9] +; 
CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf1,0xd9][2:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf1,0xd1] ; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf1,0xc1] -; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] -; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0][1:0.50] +; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3) %res2 = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) @@ -1573,13 +1539,13 @@ define <16 x i16>@test_int_x86_avx512_mask_psll_w_256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_w_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf1,0xd9] +; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf1,0xd9][2:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf1,0xd1] ; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf1,0xc1] -; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] -; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0][1:0.50] +; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3) %res2 = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1) @@ -1593,13 +1559,13 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_wi_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03] +; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03][1:1.00] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03] ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03] -; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO 
VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca][1:0.50] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1) %res2 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3) @@ -1613,13 +1579,13 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_wi_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03] +; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03][1:1.00] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03] ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03] -; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca] -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca][1:0.50] +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1) %res2 = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) @@ -1633,13 +1599,13 @@ define <8 x i16>@test_int_x86_avx512_mask_psra_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsraw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xe0,0x03] +; CHECK-NEXT: vpsraw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xe0,0x03][1:1.00] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsraw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xe0,0x03] ; CHECK-NEXT: vpsraw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xe0,0x03] -; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> 
zeroinitializer, i8 %x3) %res2 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1) @@ -1653,13 +1619,13 @@ define <16 x i16>@test_int_x86_avx512_mask_psra_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsraw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xe0,0x03] +; CHECK-NEXT: vpsraw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xe0,0x03][1:1.00] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsraw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xe0,0x03] ; CHECK-NEXT: vpsraw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xe0,0x03] -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] -; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0][1:0.50] +; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) %res2 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1) @@ -1673,13 +1639,13 @@ define <8 x i16>@test_int_x86_avx512_mask_psll_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_wi_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsllw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xf0,0x03] +; CHECK-NEXT: vpsllw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xf0,0x03][1:1.00] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsllw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xf0,0x03] ; CHECK-NEXT: vpsllw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xf0,0x03] -; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3) %res2 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1) @@ -1693,13 +1659,13 @@ define <16 x i16>@test_int_x86_avx512_mask_psll_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_wi_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsllw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xf0,0x03] +; CHECK-NEXT: vpsllw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xf0,0x03][1:1.00] ; 
CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsllw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xf0,0x03] ; CHECK-NEXT: vpsllw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xf0,0x03] -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] -; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0][1:0.50] +; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) %res2 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1) @@ -1713,11 +1679,11 @@ define <16 x i8>@test_int_x86_avx512_mask_pshuf_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_b_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xd9] +; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xd9][1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x00,0xd1] -; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) %res2 = add <16 x i8> %res, %res1 @@ -1729,11 +1695,11 @@ define <32 x i8>@test_int_x86_avx512_mask_pshuf_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_b_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpshufb %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xd9] +; CHECK-NEXT: vpshufb %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xd9][1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x00,0xd1] -; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) %res1 = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) %res2 = add <32 x i8> %res, %res1 @@ -1745,16 +1711,13 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovzxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_w_128: ; CHECK: ## 
BB#0: -; CHECK-NEXT: vpmovzxbw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x30,0xd0] -; CHECK-NEXT: ## xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-NEXT: vpmovzxbw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x30,0xd0]xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero[1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpmovzxbw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x30,0xc8] -; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NEXT: vpmovzxbw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x30,0xc0] -; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmovzxbw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x30,0xc8]xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-NEXT: vpmovzxbw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x30,0xc0]xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2) %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 -1) @@ -1768,16 +1731,13 @@ define <16 x i16>@test_int_x86_avx512_mask_pmovzxb_w_256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_w_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxbw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x30,0xd0] -; CHECK-NEXT: ## ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; CHECK-NEXT: vpmovzxbw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x30,0xd0]ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero[3:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpmovzxbw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x30,0xc8] -; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; CHECK-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z} ## encoding: 
[0x62,0xf2,0x7d,0xa9,0x30,0xc0] -; CHECK-NEXT: ## ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] -; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmovzxbw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x30,0xc8]ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; CHECK-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x30,0xc0]ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0][1:0.50] +; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2) %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 -1) @@ -1792,13 +1752,13 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovsxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_w_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x20,0xd0] +; CHECK-NEXT: vpmovsxbw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x20,0xd0][1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmovsxbw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x20,0xc8] ; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x20,0xc0] -; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2) %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 -1) @@ -1812,13 +1772,13 @@ define <16 x i16>@test_int_x86_avx512_mask_pmovsxb_w_256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_w_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xd0] +; CHECK-NEXT: vpmovsxbw %xmm0, %ymm2 ## EVEX TO VEX 
Compression encoding: [0xc4,0xe2,0x7d,0x20,0xd0][3:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmovsxbw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x20,0xc8] ; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x20,0xc0] -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] -; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0][1:0.50] +; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2) %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 -1) @@ -1832,13 +1792,13 @@ define <2 x i64>@test_int_x86_avx512_mask_pmovsxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxdq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x25,0xd0] +; CHECK-NEXT: vpmovsxdq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x25,0xd0][1:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x25,0xc8] ; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x25,0xc0] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2) %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1) @@ -1852,13 +1812,13 @@ define <4 x i64>@test_int_x86_avx512_mask_pmovsxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxdq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x25,0xd0] +; CHECK-NEXT: vpmovsxdq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x25,0xd0][3:1.00] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmovsxdq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x25,0xc8] ; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x25,0xc0] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2) %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1) @@ -1870,8 +1830,8 @@ define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_mask_packs_epi32_rr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1) ret <8 x i16> %res } @@ -1881,8 +1841,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) ret <8 x i16> %res } @@ -1892,7 +1852,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask) ret <8 x i16> %res } @@ -1900,8 +1860,8 @@ define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_packs_epi32_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0x07][5:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1) ret <8 x i16> %res @@ -1912,8 +1872,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) ret <8 x i16> %res @@ -1924,7 +1884,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: 
[0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask) ret <8 x i16> %res @@ -1934,7 +1894,7 @@ ; CHECK-LABEL: test_mask_packs_epi32_rmb_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -1947,8 +1907,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -1961,7 +1921,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -1974,8 +1934,8 @@ define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: test_mask_packs_epi32_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1) ret <16 x i16> %res } @@ -1985,8 +1945,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) ret <16 x i16> %res } @@ -1996,7 +1956,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x 
i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask) ret <16 x i16> %res } @@ -2004,8 +1964,8 @@ define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_packs_epi32_rm_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0x07][5:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1) ret <16 x i16> %res @@ -2016,8 +1976,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) ret <16 x i16> %res @@ -2028,7 +1988,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask) ret <16 x i16> %res @@ -2038,7 +1998,7 @@ ; CHECK-LABEL: test_mask_packs_epi32_rmb_256: ; CHECK: ## BB#0: ; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2051,8 +2011,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2065,7 +2025,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2078,8 +2038,8 @@ define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { ; 
CHECK-LABEL: test_mask_packs_epi16_rr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1) ret <16 x i8> %res } @@ -2089,8 +2049,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) ret <16 x i8> %res } @@ -2100,7 +2060,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask) ret <16 x i8> %res } @@ -2108,8 +2068,8 @@ define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { ; CHECK-LABEL: test_mask_packs_epi16_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0x07][5:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i16>, <8 x i16>* %ptr_b %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1) ret <16 x i8> %res @@ -2120,8 +2080,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i16>, <8 x i16>* %ptr_b %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) ret <16 x i8> %res @@ -2132,7 +2092,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i16>, <8 x i16>* %ptr_b %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask) ret <16 x i8> %res @@ -2143,8 +2103,8 @@ define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: test_mask_packs_epi16_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpacksswb %ymm1, 
%ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1) ret <32 x i8> %res } @@ -2154,8 +2114,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) ret <32 x i8> %res } @@ -2165,7 +2125,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask) ret <32 x i8> %res } @@ -2173,8 +2133,8 @@ define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { ; CHECK-LABEL: test_mask_packs_epi16_rm_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0x07][5:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <16 x i16>, <16 x i16>* %ptr_b %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1) ret <32 x i8> %res @@ -2185,8 +2145,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <16 x i16>, <16 x i16>* %ptr_b %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) ret <32 x i8> %res @@ -2197,7 +2157,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <16 x i16>, <16 x i16>* %ptr_b %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask) ret <32 x i8> %res @@ -2209,8 +2169,8 @@ define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_mask_packus_epi32_rr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1] 
-; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1) ret <8 x i16> %res } @@ -2220,8 +2180,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) ret <8 x i16> %res } @@ -2231,7 +2191,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask) ret <8 x i16> %res } @@ -2239,8 +2199,8 @@ define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_packus_epi32_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0x07][5:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1) ret <8 x i16> %res @@ -2251,8 +2211,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) ret <8 x i16> %res @@ -2263,7 +2223,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask) ret <8 x i16> %res @@ -2273,7 +2233,7 @@ ; CHECK-LABEL: test_mask_packus_epi32_rmb_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x2b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2286,8 +2246,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0x2b,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2300,7 +2260,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x99,0x2b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2313,8 +2273,8 @@ define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: test_mask_packus_epi32_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1) ret <16 x i16> %res } @@ -2324,8 +2284,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) ret <16 x i16> %res } @@ -2335,7 +2295,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask) ret <16 x i16> %res } @@ -2343,8 +2303,8 @@ define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_packus_epi32_rm_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0x07][5:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1) ret 
<16 x i16> %res @@ -2355,8 +2315,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) ret <16 x i16> %res @@ -2367,7 +2327,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask) ret <16 x i16> %res @@ -2377,7 +2337,7 @@ ; CHECK-LABEL: test_mask_packus_epi32_rmb_256: ; CHECK: ## BB#0: ; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x38,0x2b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2390,8 +2350,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x39,0x2b,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2404,7 +2364,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xb9,0x2b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2417,8 +2377,8 @@ define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_mask_packus_epi16_rr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1) ret <16 x i8> %res } @@ -2428,8 +2388,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0xd1] -; CHECK-NEXT: 
vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) ret <16 x i8> %res } @@ -2439,7 +2399,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask) ret <16 x i8> %res } @@ -2447,8 +2407,8 @@ define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { ; CHECK-LABEL: test_mask_packus_epi16_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0x07][5:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i16>, <8 x i16>* %ptr_b %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1) ret <16 x i8> %res @@ -2459,8 +2419,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i16>, <8 x i16>* %ptr_b %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) ret <16 x i8> %res @@ -2471,7 +2431,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i16>, <8 x i16>* %ptr_b %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask) ret <16 x i8> %res @@ -2482,8 +2442,8 @@ define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: test_mask_packus_epi16_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1) ret <32 x i8> %res } @@ -2493,8 +2453,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; 
CHECK-NEXT: retq ## encoding: [0xc3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33]
+; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00]
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
@@ -2504,7 +2464,7 @@
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00]
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
@@ -2512,8 +2472,8 @@
define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi16_rm_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
+; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0x07][5:1.00]
+; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
@@ -2524,8 +2484,8 @@
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0x0f]
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33]
+; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
@@ -2536,7 +2496,7 @@
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0x07]
-; CHECK-NEXT: retq ## encoding: [0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
Index: test/CodeGen/X86/avx512dqvl-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512dqvl-intrinsics.ll
+++ test/CodeGen/X86/avx512dqvl-intrinsics.ll
@@ -9,8 +9,8 @@
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtpd2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x7b,0xc8]
; CHECK-NEXT: vcvtpd2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x7b,0xc0]
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50]
+; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res,
%res1 @@ -25,8 +25,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtpd2qq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x7b,0xc8] ; CHECK-NEXT: vcvtpd2qq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x7b,0xc0] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -41,8 +41,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtpd2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x79,0xc8] ; CHECK-NEXT: vcvtpd2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x79,0xc0] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -57,8 +57,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtpd2uqq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x79,0xc8] ; CHECK-NEXT: vcvtpd2uqq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x79,0xc0] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -73,8 +73,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtps2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x7b,0xc8] ; CHECK-NEXT: vcvtps2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x7b,0xc0] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -89,8 +89,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtps2qq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x7b,0xc8] ; CHECK-NEXT: vcvtps2qq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x7b,0xc0] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq 
%ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -105,8 +105,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtps2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x79,0xc8] ; CHECK-NEXT: vcvtps2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x79,0xc0] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -121,8 +121,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtps2uqq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x79,0xc8] ; CHECK-NEXT: vcvtps2uqq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x79,0xc0] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -137,8 +137,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0xe6,0xc8] ; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0xe6,0xc0] -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -153,8 +153,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0xe6,0xc8] ; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfe,0x28,0xe6,0xc0] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -169,8 +169,8 @@ ; CHECK-NEXT: 
kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8] ; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -182,11 +182,10 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8] -; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9] -; CHECK-NEXT: ## xmm1 = xmm1[0],zero +; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9]xmm1 = xmm1[0],zero[1:0.33] ; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) @@ -203,8 +202,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8] ; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xc0] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -219,8 +218,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttpd2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x7a,0xc8] ; CHECK-NEXT: vcvttpd2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x7a,0xc0] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -235,8 +234,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm1 
{%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x7a,0xc8] ; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x7a,0xc0] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -251,8 +250,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttpd2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x78,0xc8] ; CHECK-NEXT: vcvttpd2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x78,0xc0] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -267,8 +266,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttpd2uqq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x78,0xc8] ; CHECK-NEXT: vcvttpd2uqq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x78,0xc0] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -283,8 +282,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttps2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x7a,0xc8] ; CHECK-NEXT: vcvttps2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x7a,0xc0] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -299,8 +298,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttps2qq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x7a,0xc8] ; CHECK-NEXT: vcvttps2qq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x7a,0xc0] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: 
retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -315,8 +314,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x7a,0xc8] ; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7a,0xc0] -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -331,8 +330,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtuqq2pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x7a,0xc8] ; CHECK-NEXT: vcvtuqq2pd %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfe,0x28,0x7a,0xc0] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -347,8 +346,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8] ; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -360,11 +359,10 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8] -; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9] -; CHECK-NEXT: ## xmm1 = xmm1[0],zero +; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9]xmm1 = xmm1[0],zero[1:0.33] ; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x 
float> %x1, i8 %x2) %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) @@ -381,8 +379,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8] ; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x28,0x7a,0xc0] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -397,8 +395,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttps2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x78,0xc8] ; CHECK-NEXT: vcvttps2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x78,0xc0] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -413,8 +411,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttps2uqq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x78,0xc8] ; CHECK-NEXT: vcvttps2uqq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x78,0xc0] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -429,8 +427,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vreducepd $4, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x56,0xc8,0x04] ; CHECK-NEXT: vreducepd $8, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x56,0xc0,0x08] -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.reduce.pd.128(<2 x double> %x0, i32 4, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.reduce.pd.128(<2 x double> %x0, i32 8, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -445,8 +443,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vreducepd $4, %ymm0, %ymm1 {%k1} ## encoding: 
[0x62,0xf3,0xfd,0x29,0x56,0xc8,0x04] ; CHECK-NEXT: vreducepd $0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x56,0xc0,0x00] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.reduce.pd.256(<4 x double> %x0, i32 4, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.reduce.pd.256(<4 x double> %x0, i32 0, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -461,8 +459,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vreduceps $4, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x56,0xc8,0x04] ; CHECK-NEXT: vreduceps $88, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x56,0xc0,0x58] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.reduce.ps.128(<4 x float> %x0, i32 4, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.reduce.ps.128(<4 x float> %x0, i32 88, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -477,8 +475,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vreduceps $11, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x56,0xc8,0x0b] ; CHECK-NEXT: vreduceps $11, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x56,0xc0,0x0b] -; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.reduce.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.reduce.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -493,8 +491,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrangepd $4, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x50,0xd1,0x04] ; CHECK-NEXT: vrangepd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x50,0xc1,0x08] -; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double> %x0, <2 x double> %x1, i32 4, <2 x double> %x3, i8 %x4) %res1 = call <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double> %x0, <2 x double> %x1, i32 8, <2 x double> %x3, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -509,8 +507,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrangepd $4, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x50,0xd1,0x04] ; CHECK-NEXT: vrangepd $88, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x50,0xc1,0x58] -; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 
## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double> %x0, <4 x double> %x1, i32 4, <4 x double> %x3, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double> %x0, <4 x double> %x1, i32 88, <4 x double> %x3, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -525,8 +523,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrangeps $4, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x50,0xd1,0x04] ; CHECK-NEXT: vrangeps $88, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x50,0xc1,0x58] -; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float> %x0, <4 x float> %x1, i32 4, <4 x float> %x3, i8 %x4) %res1 = call <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float> %x0, <4 x float> %x1, i32 88, <4 x float> %x3, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -541,8 +539,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrangeps $4, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x50,0xd1,0x04] ; CHECK-NEXT: vrangeps $88, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x50,0xc1,0x58] -; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float> %x0, <8 x float> %x1, i32 4, <8 x float> %x3, i8 %x4) %res1 = call <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float> %x0, <8 x float> %x1, i32 88, <8 x float> %x3, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -559,9 +557,9 @@ ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; CHECK-NEXT: vfpclassps $4, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x66,0xc0,0x04] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8][1:0.25] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float> %x0, i32 2, i8 %x1) %res1 = call i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float> %x0, i32 4, i8 -1) %res2 = add i8 %res, %res1 @@ -578,9 +576,9 @@ ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; CHECK-NEXT: vfpclassps $4, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x66,0xc0,0x04] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8][1:0.25] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float> %x0, i32 2, i8 %x1) %res1 = call i8 
@llvm.x86.avx512.mask.fpclass.ps.256(<8 x float> %x0, i32 4, i8 -1) %res2 = add i8 %res, %res1 @@ -597,9 +595,9 @@ ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; CHECK-NEXT: vfpclasspd $2, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x66,0xc0,0x02] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8][1:0.25] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double> %x0, i32 4, i8 %x1) %res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double> %x0, i32 2, i8 -1) %res2 = add i8 %res, %res1 @@ -616,9 +614,9 @@ ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; CHECK-NEXT: vfpclasspd $4, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x66,0xc0,0x04] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8][1:0.25] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double> %x0, i32 2, i8 %x1) %res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double> %x0, i32 4, i8 -1) %res2 = add i8 %res, %res1 @@ -631,14 +629,12 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x2_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x19,0xc8] -; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1] -; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0] -; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0] -; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] -; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x19,0xc8]ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0]ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0][3:1.00] +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca][3:1.00] +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %x3) %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 -1) @@ -653,14 +649,12 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vbroadcasti32x2 (%rsi), %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x59,0x0e] -; CHECK-NEXT: ## ymm1 {%k1} = 
mem[0,1,0,1,0,1,0,1] -; CHECK-NEXT: vbroadcasti32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x59,0xd0] -; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] -; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xc0] -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vbroadcasti32x2 (%rsi), %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x59,0x0e]ymm1 {%k1} = mem[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vbroadcasti32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x59,0xd0]ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xc0][3:1.00] +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %y_64 = load i64, i64 * %y_ptr %y_v2i64 = insertelement <2 x i64> undef, i64 %y_64, i32 0 %y = bitcast <2 x i64> %y_v2i64 to <4 x i32> @@ -680,10 +674,10 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8] ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0] -; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] -; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0][1:1.00] +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca][1:0.50] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x3) %res2 = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 -1) @@ -700,7 +694,7 @@ ; CHECK-NEXT: vpmovd2m %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x39,0xc0] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32> %x0) ret i8 %res } @@ -713,7 +707,7 @@ ; CHECK-NEXT: vpmovd2m %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x39,0xc0] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.cvtd2mask.256(<8 x i32> %x0) ret i8 %res } @@ -726,7 +720,7 @@ ; CHECK-NEXT: vpmovq2m %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x39,0xc0] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; 
CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64> %x0) ret i8 %res } @@ -739,7 +733,7 @@ ; CHECK-NEXT: vpmovq2m %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x39,0xc0] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64> %x0) ret i8 %res } @@ -751,7 +745,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7] ; CHECK-NEXT: vpmovm2d %k0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.cvtmask2d.128(i8 %x0) ret <4 x i32> %res } @@ -763,7 +757,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7] ; CHECK-NEXT: vpmovm2d %k0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x38,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.cvtmask2d.256(i8 %x0) ret <8 x i32> %res } @@ -775,7 +769,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7] ; CHECK-NEXT: vpmovm2q %k0, %xmm0 ## encoding: [0x62,0xf2,0xfe,0x08,0x38,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.cvtmask2q.128(i8 %x0) ret <2 x i64> %res } @@ -787,7 +781,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7] ; CHECK-NEXT: vpmovm2q %k0, %ymm0 ## encoding: [0x62,0xf2,0xfe,0x28,0x38,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.cvtmask2q.256(i8 %x0) ret <4 x i64> %res } @@ -799,15 +793,12 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x23,0xd0,0x00] -; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,0,1] -; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xc8,0x00] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,0,1] -; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc0,0x00] -; CHECK-NEXT: ## ymm0 = ymm0[0,1,0,1] -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] -; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x23,0xd0,0x00]ymm2 {%k1} {z} = ymm0[0,1,0,1] +; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xc8,0x00]ymm1 {%k1} = ymm0[0,1,0,1] +; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc0,0x00]ymm0 = ymm0[0,1,0,1] +; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1][3:1.00] +; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 -1) %res2 = call <4 x 
double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 %mask) @@ -821,10 +812,9 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256_load: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovapd (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x0f] -; CHECK-NEXT: vshuff64x2 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x23,0xc1,0x00] -; CHECK-NEXT: ## ymm0 {%k1} = ymm1[0,1,0,1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovapd (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x0f][4:0.50] +; CHECK-NEXT: vshuff64x2 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x23,0xc1,0x00]ymm0 {%k1} = ymm1[0,1,0,1] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %x0 = load <2 x double>, <2 x double>* %x0ptr %res = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 %mask) @@ -838,15 +828,12 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x43,0xd0,0x00] -; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,0,1] -; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xc8,0x00] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,0,1] -; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x43,0xc0,0x00] -; CHECK-NEXT: ## ymm0 = ymm0[0,1,0,1] -; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x43,0xd0,0x00]ymm2 {%k1} {z} = ymm0[0,1,0,1] +; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xc8,0x00]ymm1 {%k1} = ymm0[0,1,0,1] +; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x43,0xc0,0x00]ymm0 = ymm0[0,1,0,1] +; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1][1:0.50] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res1 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 -1) %res2 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) @@ -860,10 +847,9 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256_load: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f] -; CHECK-NEXT: vshufi64x2 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x43,0xc1,0x00] -; CHECK-NEXT: ## ymm0 {%k1} = ymm1[0,1,0,1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f][4:0.50] +; CHECK-NEXT: vshufi64x2 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x43,0xc1,0x00]ymm0 {%k1} = ymm1[0,1,0,1] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %x0 = load <2 x i64>, <2 x i64>* %x0ptr %res = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 
x i64> %x2, i8 %mask)
Index: test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -10,8 +10,8 @@
; CHECK-NEXT: vpbroadcastd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x58,0xc8]
; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x58,0xc0]
; CHECK-NEXT: vpaddd (%rsi){1to8}, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x38,0xfe,0x0e]
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1][1:0.50]
+; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00]
%y_32 = load i32, i32 * %y_ptr
%y = insertelement <4 x i32> undef, i32 %y_32, i32 0
%res = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %y, <8 x i32> %x1, i8 -1)
@@ -27,13 +27,13 @@
define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0]
+; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0][1:1.00]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8]
; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0]
-; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
+; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9][1:0.50]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1][1:0.50]
+; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00]
%res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask)
%res2 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %mask)
@@ -47,13 +47,13 @@
define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0]
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0][3:1.00]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8]
; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0]
-; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9]
-; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression
encoding: [0xc5,0xfd,0xd4,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1) %res1 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 %mask) %res2 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> zeroinitializer,i8 %mask) @@ -67,13 +67,13 @@ define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8] ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] -; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9] -; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9][1:0.50] +; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1) %res1 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 %mask) %res2 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> zeroinitializer,i8 %mask) @@ -87,13 +87,13 @@ define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1, i8 %mask ) { ; CHECK-LABEL: test_x86_vbroadcast_sd_pd_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0] +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0][3:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8] ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] -; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9][3:1.00] +; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1) %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> %a1, i8 %mask) %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 %mask) @@ -107,13 +107,13 @@ define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1, i8 %mask ) { ; CHECK-LABEL: test_x86_vbroadcast_ss_ps_256: ; CHECK: ## BB#0: -; 
CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0] +; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0][3:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8] ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] -; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9][3:1.00] +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1) %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> %a1, i8 %mask) %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 %mask) @@ -127,13 +127,13 @@ define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask ) { ; CHECK-LABEL: test_x86_vbroadcast_ss_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0] +; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8] ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] -; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9] -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9][3:1.00] +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) %res1 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) %res2 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) @@ -147,16 +147,13 @@ define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovsldup %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x12,0xd0] -; CHECK-NEXT: ## xmm2 = xmm0[0,0,2,2] +; CHECK-NEXT: vmovsldup %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x12,0xd0]xmm2 = xmm0[0,0,2,2][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8] -; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0,2,2] -; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## 
encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0] -; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0,2,2] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8]xmm1 {%k1} = xmm0[0,0,2,2] +; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0]xmm0 {%k1} {z} = xmm0[0,0,2,2] +; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca][3:1.00] +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) %res1 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 -1) %res2 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2) @@ -170,16 +167,13 @@ define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovsldup %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x12,0xd0] -; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vmovsldup %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x12,0xd0]ymm2 = ymm0[0,0,2,2,4,4,6,6][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6] -; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0] -; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6] -; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8]ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0]ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca][3:1.00] +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) %res1 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 -1) %res2 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2) @@ -193,16 +187,13 @@ define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovshdup %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd0] -; CHECK-NEXT: ## xmm2 = xmm0[1,1,3,3] +; CHECK-NEXT: vmovshdup %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd0]xmm2 = xmm0[1,1,3,3][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## 
encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8] -; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,1,3,3] -; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0] -; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,1,3,3] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8]xmm1 {%k1} = xmm0[1,1,3,3] +; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0]xmm0 {%k1} {z} = xmm0[1,1,3,3] +; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca][3:1.00] +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) %res1 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 -1) %res2 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2) @@ -216,16 +207,13 @@ define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovshdup %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x16,0xd0] -; CHECK-NEXT: ## ymm2 = ymm0[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vmovshdup %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x16,0xd0]ymm2 = ymm0[1,1,3,3,5,5,7,7][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7] -; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0] -; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7] -; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8]ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0]ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca][3:1.00] +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) %res1 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 -1) %res2 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2) @@ -238,16 +226,13 @@ define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_movddup_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovddup %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: 
[0xc5,0xfb,0x12,0xd0] -; CHECK-NEXT: ## xmm2 = xmm0[0,0] +; CHECK-NEXT: vmovddup %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xd0]xmm2 = xmm0[0,0][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8] -; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0] -; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0] -; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0] -; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8]xmm1 {%k1} = xmm0[0,0] +; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0]xmm0 {%k1} {z} = xmm0[0,0] +; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca][3:1.00] +; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2) %res1 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 -1) %res2 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> zeroinitializer, i8 %x2) @@ -261,16 +246,13 @@ define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_movddup_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovddup %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0x12,0xd0] -; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2] +; CHECK-NEXT: vmovddup %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0x12,0xd0]ymm2 = ymm0[0,0,2,2][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2] -; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0] -; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2] -; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca] -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8]ymm1 {%k1} = ymm0[0,0,2,2] +; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0]ymm0 {%k1} {z} = ymm0[0,0,2,2] +; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca][3:1.00] +; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2) %res1 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 -1) %res2 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> zeroinitializer, i8 %x2) @@ -284,16 +266,13 @@ define <4 x double>@test_int_x86_avx512_mask_vpermil_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) { 
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermilpd $6, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xd0,0x06] -; CHECK-NEXT: ## ymm2 = ymm0[0,1,3,2] +; CHECK-NEXT: vpermilpd $6, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xd0,0x06]ymm2 = ymm0[0,1,3,2][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpermilpd $6, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x05,0xc8,0x06] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,3,2] -; CHECK-NEXT: vpermilpd $6, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x05,0xc0,0x06] -; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,1,3,2] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpermilpd $6, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x05,0xc8,0x06]ymm1 {%k1} = ymm0[0,1,3,2] +; CHECK-NEXT: vpermilpd $6, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x05,0xc0,0x06]ymm0 {%k1} {z} = ymm0[0,1,3,2] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> zeroinitializer, i8 %x3) %res2 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 -1) @@ -307,16 +286,13 @@ define <2 x double>@test_int_x86_avx512_mask_vpermil_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xd0,0x01] -; CHECK-NEXT: ## xmm2 = xmm0[1,0] +; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xd0,0x01]xmm2 = xmm0[1,0][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x05,0xc8,0x01] -; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,0] -; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0x89,0x05,0xc0,0x01] -; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,0] -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] -; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x05,0xc8,0x01]xmm1 {%k1} = xmm0[1,0] +; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0x89,0x05,0xc0,0x01]xmm0 {%k1} {z} = xmm0[1,0] +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 %x3) %res1 = 
call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> zeroinitializer, i8 %x3) %res2 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 -1) @@ -330,16 +306,13 @@ define <8 x float>@test_int_x86_avx512_mask_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermilps $22, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xd0,0x16] -; CHECK-NEXT: ## ymm2 = ymm0[2,1,1,0,6,5,5,4] +; CHECK-NEXT: vpermilps $22, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xd0,0x16]ymm2 = ymm0[2,1,1,0,6,5,5,4][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x04,0xc8,0x16] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[2,1,1,0,6,5,5,4] -; CHECK-NEXT: vpermilps $22, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x04,0xc0,0x16] -; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[2,1,1,0,6,5,5,4] -; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] -; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x04,0xc8,0x16]ymm1 {%k1} = ymm0[2,1,1,0,6,5,5,4] +; CHECK-NEXT: vpermilps $22, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x04,0xc0,0x16]ymm0 {%k1} {z} = ymm0[2,1,1,0,6,5,5,4] +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> zeroinitializer, i8 %x3) %res2 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 -1) @@ -353,16 +326,13 @@ define <4 x float>@test_int_x86_avx512_mask_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermilps $22, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xd0,0x16] -; CHECK-NEXT: ## xmm2 = xmm0[2,1,1,0] +; CHECK-NEXT: vpermilps $22, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xd0,0x16]xmm2 = xmm0[2,1,1,0][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpermilps $22, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x04,0xc8,0x16] -; CHECK-NEXT: ## xmm1 {%k1} = xmm0[2,1,1,0] -; CHECK-NEXT: vpermilps $22, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x04,0xc0,0x16] -; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[2,1,1,0] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpermilps $22, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x04,0xc8,0x16]xmm1 {%k1} = xmm0[2,1,1,0] +; CHECK-NEXT: vpermilps $22, %xmm0, %xmm0 {%k1} {z} ## encoding: 
[0x62,0xf3,0x7d,0x89,0x04,0xc0,0x16]xmm0 {%k1} {z} = xmm0[2,1,1,0] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> zeroinitializer, i8 %x3) %res2 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 -1) @@ -376,16 +346,13 @@ define <4 x double>@test_int_x86_avx512_mask_perm_df_256(<4 x double> %x0, i32 %x1, <4 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermpd $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x01,0xd0,0x03] -; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0] +; CHECK-NEXT: vpermpd $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x01,0xd0,0x03]ymm2 = ymm0[3,0,0,0][3:1.00] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpermpd $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x01,0xc8,0x03] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0] -; CHECK-NEXT: vpermpd $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x01,0xc0,0x03] -; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpermpd $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x01,0xc8,0x03]ymm1 {%k1} = ymm0[3,0,0,0] +; CHECK-NEXT: vpermpd $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x01,0xc0,0x03]ymm0 {%k1} {z} = ymm0[3,0,0,0] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> zeroinitializer, i8 %x3) %res2 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 -1) @@ -399,16 +366,13 @@ define <4 x i64>@test_int_x86_avx512_mask_perm_di_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x00,0xd0,0x03] -; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0] +; CHECK-NEXT: vpermq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x00,0xd0,0x03]ymm2 = ymm0[3,0,0,0][3:1.00] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpermq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x00,0xc8,0x03] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0] -; CHECK-NEXT: vpermq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x00,0xc0,0x03] -; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpermq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x00,0xc8,0x03]ymm1 {%k1} = ymm0[3,0,0,0] +; CHECK-NEXT: vpermq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x00,0xc0,0x03]ymm0 {%k1} {z} = ymm0[3,0,0,0] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3) %res2 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1) @@ -424,8 +388,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] ; CHECK-NEXT: vmovapd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x29,0x07] -; CHECK-NEXT: vmovapd %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x29,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovapd %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x29,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.store.pd.128(i8* %ptr1, <2 x double> %x1, i8 %x2) call void @llvm.x86.avx512.mask.store.pd.128(i8* %ptr2, <2 x double> %x1, i8 -1) ret void @@ -438,8 +402,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] ; CHECK-NEXT: vmovapd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x29,0x07] -; CHECK-NEXT: vmovapd %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x29,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovapd %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x29,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.store.pd.256(i8* %ptr1, <4 x double> %x1, i8 %x2) call void @llvm.x86.avx512.mask.store.pd.256(i8* %ptr2, <4 x double> %x1, i8 -1) ret void @@ -452,8 +416,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] ; CHECK-NEXT: vmovupd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x11,0x07] -; CHECK-NEXT: vmovupd %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x11,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovupd %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x11,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.storeu.pd.128(i8* %ptr1, <2 x double> %x1, i8 %x2) call void @llvm.x86.avx512.mask.storeu.pd.128(i8* %ptr2, <2 x double> %x1, i8 -1) ret void @@ -466,8 +430,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] ; CHECK-NEXT: vmovupd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x11,0x07] -; CHECK-NEXT: vmovupd %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x11,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovupd %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x11,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.storeu.pd.256(i8* %ptr1, <4 x double> %x1, i8 %x2) call void 
@llvm.x86.avx512.mask.storeu.pd.256(i8* %ptr2, <4 x double> %x1, i8 -1) ret void @@ -480,8 +444,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] ; CHECK-NEXT: vmovaps %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x29,0x07] -; CHECK-NEXT: vmovaps %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.store.ps.128(i8* %ptr1, <4 x float> %x1, i8 %x2) call void @llvm.x86.avx512.mask.store.ps.128(i8* %ptr2, <4 x float> %x1, i8 -1) ret void @@ -494,8 +458,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] ; CHECK-NEXT: vmovaps %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x29,0x07] -; CHECK-NEXT: vmovaps %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.store.ps.256(i8* %ptr1, <8 x float> %x1, i8 %x2) call void @llvm.x86.avx512.mask.store.ps.256(i8* %ptr2, <8 x float> %x1, i8 -1) ret void @@ -508,8 +472,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] ; CHECK-NEXT: vmovups %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x11,0x07] -; CHECK-NEXT: vmovups %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovups %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.storeu.ps.128(i8* %ptr1, <4 x float> %x1, i8 %x2) call void @llvm.x86.avx512.mask.storeu.ps.128(i8* %ptr2, <4 x float> %x1, i8 -1) ret void @@ -522,8 +486,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] ; CHECK-NEXT: vmovups %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x11,0x07] -; CHECK-NEXT: vmovups %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovups %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.storeu.ps.256(i8* %ptr1, <8 x float> %x1, i8 %x2) call void @llvm.x86.avx512.mask.storeu.ps.256(i8* %ptr2, <8 x float> %x1, i8 -1) ret void @@ -536,8 +500,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] ; CHECK-NEXT: vmovdqu64 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x7f,0x07] -; CHECK-NEXT: vmovdqu %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqu %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.storeu.q.128(i8* %ptr1, <2 x i64> %x1, i8 %x2) call void @llvm.x86.avx512.mask.storeu.q.128(i8* %ptr2, <2 x i64> %x1, i8 -1) ret void @@ -550,8 +514,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] ; CHECK-NEXT: vmovdqu64 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x7f,0x07] -; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ## 
EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.storeu.q.256(i8* %ptr1, <4 x i64> %x1, i8 %x2) call void @llvm.x86.avx512.mask.storeu.q.256(i8* %ptr2, <4 x i64> %x1, i8 -1) ret void @@ -564,8 +528,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] ; CHECK-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x7f,0x07] -; CHECK-NEXT: vmovdqu %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqu %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.storeu.d.128(i8* %ptr1, <4 x i32> %x1, i8 %x2) call void @llvm.x86.avx512.mask.storeu.d.128(i8* %ptr2, <4 x i32> %x1, i8 -1) ret void @@ -578,8 +542,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] ; CHECK-NEXT: vmovdqu32 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x7f,0x07] -; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.storeu.d.256(i8* %ptr1, <8 x i32> %x1, i8 %x2) call void @llvm.x86.avx512.mask.storeu.d.256(i8* %ptr2, <8 x i32> %x1, i8 -1) ret void @@ -592,8 +556,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] ; CHECK-NEXT: vmovdqa64 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x7f,0x07] -; CHECK-NEXT: vmovdqa %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.store.q.128(i8* %ptr1, <2 x i64> %x1, i8 %x2) call void @llvm.x86.avx512.mask.store.q.128(i8* %ptr2, <2 x i64> %x1, i8 -1) ret void @@ -606,8 +570,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] ; CHECK-NEXT: vmovdqa64 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x7f,0x07] -; CHECK-NEXT: vmovdqa %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x7f,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x7f,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.store.q.256(i8* %ptr1, <4 x i64> %x1, i8 %x2) call void @llvm.x86.avx512.mask.store.q.256(i8* %ptr2, <4 x i64> %x1, i8 -1) ret void @@ -620,8 +584,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] ; CHECK-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x7f,0x07] -; CHECK-NEXT: vmovdqa %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.store.d.128(i8* %ptr1, <4 x i32> %x1, i8 %x2) 
call void @llvm.x86.avx512.mask.store.d.128(i8* %ptr2, <4 x i32> %x1, i8 -1) ret void @@ -634,8 +598,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] ; CHECK-NEXT: vmovdqa32 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x7f,0x07] -; CHECK-NEXT: vmovdqa %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x7f,0x06] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x7f,0x06][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.store.d.256(i8* %ptr1, <8 x i32> %x1, i8 %x2) call void @llvm.x86.avx512.mask.store.d.256(i8* %ptr2, <8 x i32> %x1, i8 -1) ret void @@ -644,12 +608,12 @@ define <8 x float> @test_mask_load_aligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07] +; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07][4:0.50] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x28,0x07] -; CHECK-NEXT: vmovaps (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x28,0x0f] -; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x28,0x07][4:0.50] +; CHECK-NEXT: vmovaps (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x28,0x0f][4:0.50] +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1) %res1 = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> %res, i8 %mask) %res2 = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 %mask) @@ -662,12 +626,12 @@ define <8 x float> @test_mask_load_unaligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_unaligned_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07] +; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07][4:0.50] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x10,0x07] -; CHECK-NEXT: vmovups (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x10,0x0f] -; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x10,0x07][4:0.50] +; CHECK-NEXT: vmovups (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x10,0x0f][4:0.50] +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1) %res1 = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> %res, i8 %mask) %res2 = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x 
float> zeroinitializer, i8 %mask) @@ -680,12 +644,12 @@ define <4 x double> @test_mask_load_aligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_pd_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovapd (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0x07] +; CHECK-NEXT: vmovapd (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0x07][4:0.50] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x28,0x07] -; CHECK-NEXT: vmovapd (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x28,0x0f] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x28,0x07][4:0.50] +; CHECK-NEXT: vmovapd (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x28,0x0f][4:0.50] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 -1) %res1 = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> %res, i8 %mask) %res2 = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 %mask) @@ -698,12 +662,12 @@ define <4 x double> @test_mask_load_unaligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_unaligned_pd_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovupd (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x10,0x07] +; CHECK-NEXT: vmovupd (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x10,0x07][4:0.50] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x10,0x07] -; CHECK-NEXT: vmovupd (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x10,0x0f] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x10,0x07][4:0.50] +; CHECK-NEXT: vmovupd (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x10,0x0f][4:0.50] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 -1) %res1 = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> %res, i8 %mask) %res2 = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 %mask) @@ -716,12 +680,12 @@ define <4 x float> @test_mask_load_aligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07][4:0.50] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x28,0x07] -; CHECK-NEXT: vmovaps (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x28,0x0f] -; CHECK-NEXT: vaddps 
%xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x28,0x07][4:0.50] +; CHECK-NEXT: vmovaps (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x28,0x0f][4:0.50] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1) %res1 = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> %res, i8 %mask) %res2 = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 %mask) @@ -734,12 +698,12 @@ define <4 x float> @test_mask_load_unaligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_unaligned_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] +; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07][4:0.50] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x10,0x07] -; CHECK-NEXT: vmovups (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x10,0x0f] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x10,0x07][4:0.50] +; CHECK-NEXT: vmovups (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x10,0x0f][4:0.50] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1) %res1 = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> %res, i8 %mask) %res2 = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 %mask) @@ -752,12 +716,12 @@ define <2 x double> @test_mask_load_aligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_pd_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovapd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x07] +; CHECK-NEXT: vmovapd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x07][4:0.50] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x28,0x07] -; CHECK-NEXT: vmovapd (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x28,0x0f] -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x28,0x07][4:0.50] +; CHECK-NEXT: vmovapd (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x28,0x0f][4:0.50] +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 -1) %res1 = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> %res, i8 %mask) %res2 = call <2 x 
double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 %mask) @@ -770,12 +734,12 @@ define <2 x double> @test_mask_load_unaligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_unaligned_pd_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovupd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x10,0x07] +; CHECK-NEXT: vmovupd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x10,0x07][4:0.50] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x10,0x07] -; CHECK-NEXT: vmovupd (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x10,0x0f] -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x10,0x07][4:0.50] +; CHECK-NEXT: vmovupd (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x10,0x0f][4:0.50] +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 -1) %res1 = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> %res, i8 %mask) %res2 = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 %mask) @@ -790,12 +754,12 @@ define <4 x i32> @test_mask_load_unaligned_d_128(i8* %ptr, i8* %ptr2, <4 x i32> %data, i8 %mask) { ; CHECK-LABEL: test_mask_load_unaligned_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07][4:0.50] ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] -; CHECK-NEXT: vmovdqu32 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x6f,0x06] -; CHECK-NEXT: vmovdqu32 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x6f,0x0f] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqu32 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x6f,0x06][4:0.50] +; CHECK-NEXT: vmovdqu32 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x6f,0x0f][4:0.50] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 -1) %res1 = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr2, <4 x i32> %res, i8 %mask) %res2 = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 %mask) @@ -808,12 +772,12 @@ define <8 x i32> @test_mask_load_unaligned_d_256(i8* %ptr, i8* %ptr2, <8 x i32> %data, i8 %mask) { ; CHECK-LABEL: test_mask_load_unaligned_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07][4:0.50] ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] -; CHECK-NEXT: vmovdqu32 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x6f,0x06] -; CHECK-NEXT: vmovdqu32 (%rdi), %ymm1 {%k1} {z} 
## encoding: [0x62,0xf1,0x7e,0xa9,0x6f,0x0f] -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqu32 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x6f,0x06][4:0.50] +; CHECK-NEXT: vmovdqu32 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x6f,0x0f][4:0.50] +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 -1) %res1 = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr2, <8 x i32> %res, i8 %mask) %res2 = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 %mask) @@ -826,12 +790,12 @@ define <2 x i64> @test_mask_load_unaligned_q_128(i8* %ptr, i8* %ptr2, <2 x i64> %data, i8 %mask) { ; CHECK-LABEL: test_mask_load_unaligned_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07][4:0.50] ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] -; CHECK-NEXT: vmovdqu64 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x6f,0x06] -; CHECK-NEXT: vmovdqu64 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0x89,0x6f,0x0f] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqu64 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x6f,0x06][4:0.50] +; CHECK-NEXT: vmovdqu64 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0x89,0x6f,0x0f][4:0.50] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 -1) %res1 = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr2, <2 x i64> %res, i8 %mask) %res2 = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 %mask) @@ -844,12 +808,12 @@ define <4 x i64> @test_mask_load_unaligned_q_256(i8* %ptr, i8* %ptr2, <4 x i64> %data, i8 %mask) { ; CHECK-LABEL: test_mask_load_unaligned_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07][4:0.50] ; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] -; CHECK-NEXT: vmovdqu64 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x6f,0x06] -; CHECK-NEXT: vmovdqu64 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xa9,0x6f,0x0f] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqu64 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x6f,0x06][4:0.50] +; CHECK-NEXT: vmovdqu64 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xa9,0x6f,0x0f][4:0.50] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 -1) %res1 = call <4 x i64> 
@llvm.x86.avx512.mask.loadu.q.256(i8* %ptr2, <4 x i64> %res, i8 %mask) %res2 = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 %mask) @@ -862,12 +826,12 @@ define <4 x i32> @test_mask_load_aligned_d_128(<4 x i32> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07] +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07][4:0.50] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6f,0x07] -; CHECK-NEXT: vmovdqa32 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6f,0x0f] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6f,0x07][4:0.50] +; CHECK-NEXT: vmovdqa32 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6f,0x0f][4:0.50] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.load.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 -1) %res1 = call <4 x i32> @llvm.x86.avx512.mask.load.d.128(i8* %ptr, <4 x i32> %res, i8 %mask) %res2 = call <4 x i32> @llvm.x86.avx512.mask.load.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 %mask) @@ -880,12 +844,12 @@ define <8 x i32> @test_mask_load_aligned_d_256(<8 x i32> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x07] +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x07][4:0.50] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6f,0x07] -; CHECK-NEXT: vmovdqa32 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6f,0x0f] -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6f,0x07][4:0.50] +; CHECK-NEXT: vmovdqa32 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6f,0x0f][4:0.50] +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.load.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 -1) %res1 = call <8 x i32> @llvm.x86.avx512.mask.load.d.256(i8* %ptr, <8 x i32> %res, i8 %mask) %res2 = call <8 x i32> @llvm.x86.avx512.mask.load.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 %mask) @@ -898,12 +862,12 @@ define <2 x i64> @test_mask_load_aligned_q_128(<2 x i64> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07] +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07][4:0.50] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6f,0x07] -; CHECK-NEXT: vmovdqa64 
(%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x6f,0x0f] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6f,0x07][4:0.50] +; CHECK-NEXT: vmovdqa64 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x6f,0x0f][4:0.50] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.load.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 -1) %res1 = call <2 x i64> @llvm.x86.avx512.mask.load.q.128(i8* %ptr, <2 x i64> %res, i8 %mask) %res2 = call <2 x i64> @llvm.x86.avx512.mask.load.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 %mask) @@ -916,12 +880,12 @@ define <4 x i64> @test_mask_load_aligned_q_256(<4 x i64> %data, i8* %ptr, i8 %mask) { ; CHECK-LABEL: test_mask_load_aligned_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x07] +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x07][4:0.50] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6f,0x07] -; CHECK-NEXT: vmovdqa64 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x6f,0x0f] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6f,0x07][4:0.50] +; CHECK-NEXT: vmovdqa64 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x6f,0x0f][4:0.50] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.load.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 -1) %res1 = call <4 x i64> @llvm.x86.avx512.mask.load.q.256(i8* %ptr, <4 x i64> %res, i8 %mask) %res2 = call <4 x i64> @llvm.x86.avx512.mask.load.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 %mask) @@ -934,16 +898,13 @@ define <4 x i32>@test_int_x86_avx512_mask_pshuf_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpshufd $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xd0,0x03] -; CHECK-NEXT: ## xmm2 = xmm0[3,0,0,0] +; CHECK-NEXT: vpshufd $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xd0,0x03]xmm2 = xmm0[3,0,0,0][1:1.00] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpshufd $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x70,0xc8,0x03] -; CHECK-NEXT: ## xmm1 {%k1} = xmm0[3,0,0,0] -; CHECK-NEXT: vpshufd $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x70,0xc0,0x03] -; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[3,0,0,0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpshufd $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x70,0xc8,0x03]xmm1 {%k1} = xmm0[3,0,0,0] +; CHECK-NEXT: vpshufd $3, %xmm0, %xmm0 {%k1} {z} ## encoding: 
[0x62,0xf1,0x7d,0x89,0x70,0xc0,0x03]xmm0 {%k1} {z} = xmm0[3,0,0,0] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3) %res2 = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1) @@ -957,16 +918,13 @@ define <8 x i32>@test_int_x86_avx512_mask_pshuf_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpshufd $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x70,0xd0,0x03] -; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0,7,4,4,4] +; CHECK-NEXT: vpshufd $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x70,0xd0,0x03]ymm2 = ymm0[3,0,0,0,7,4,4,4][1:1.00] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpshufd $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x70,0xc8,0x03] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0,7,4,4,4] -; CHECK-NEXT: vpshufd $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x70,0xc0,0x03] -; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0,7,4,4,4] -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpshufd $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x70,0xc8,0x03]ymm1 {%k1} = ymm0[3,0,0,0,7,4,4,4] +; CHECK-NEXT: vpshufd $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x70,0xc0,0x03]ymm0 {%k1} {z} = ymm0[3,0,0,0,7,4,4,4] +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3) %res2 = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1) @@ -978,10 +936,10 @@ define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: test_pcmpeq_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1] +; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1][?:0.000000e+00] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1) ret i8 %res } @@ -990,10 +948,10 @@ ; CHECK-LABEL: test_mask_pcmpeq_d_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1] +; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## encoding: 
[0x62,0xf1,0x7d,0x29,0x76,0xc1][?:0.000000e+00] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask) ret i8 %res } @@ -1003,12 +961,12 @@ define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) { ; CHECK-LABEL: test_pcmpeq_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1] +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1][?:0.000000e+00] ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1) ret i8 %res } @@ -1017,12 +975,12 @@ ; CHECK-LABEL: test_mask_pcmpeq_q_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x29,0xc1] +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x29,0xc1][?:0.000000e+00] ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask) ret i8 %res } @@ -1032,10 +990,10 @@ define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: test_pcmpgt_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x66,0xc1] +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x66,0xc1][?:0.000000e+00] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1) ret i8 %res } @@ -1044,10 +1002,10 @@ ; CHECK-LABEL: test_mask_pcmpgt_d_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc1] +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc1][?:0.000000e+00] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask) ret i8 %res } @@ -1057,12 +1015,12 @@ define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) { ; CHECK-LABEL: test_pcmpgt_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x37,0xc1] +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x37,0xc1][?:0.000000e+00] ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: 
[0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1) ret i8 %res } @@ -1071,12 +1029,12 @@ ; CHECK-LABEL: test_mask_pcmpgt_q_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x37,0xc1] +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x37,0xc1][?:0.000000e+00] ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask) ret i8 %res } @@ -1086,12 +1044,12 @@ define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_pcmpeq_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1] +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1][?:0.000000e+00] ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1) ret i8 %res } @@ -1100,12 +1058,12 @@ ; CHECK-LABEL: test_mask_pcmpeq_d_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x76,0xc1] +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x76,0xc1][?:0.000000e+00] ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask) ret i8 %res } @@ -1115,14 +1073,14 @@ define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_pcmpeq_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1] +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1][?:0.000000e+00] ; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e] ; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e] ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: 
retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1) ret i8 %res } @@ -1131,14 +1089,14 @@ ; CHECK-LABEL: test_mask_pcmpeq_q_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x29,0xc1] +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x29,0xc1][?:0.000000e+00] ; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e] ; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e] ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask) ret i8 %res } @@ -1148,12 +1106,12 @@ define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_pcmpgt_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x66,0xc1] +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x66,0xc1][?:0.000000e+00] ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1) ret i8 %res } @@ -1162,12 +1120,12 @@ ; CHECK-LABEL: test_mask_pcmpgt_d_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x66,0xc1] +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x66,0xc1][?:0.000000e+00] ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask) ret i8 %res } @@ -1177,14 +1135,14 @@ define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_pcmpgt_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x37,0xc1] +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x37,0xc1][?:0.000000e+00] ; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e] ; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e] ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: 
[0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1) ret i8 %res } @@ -1193,14 +1151,14 @@ ; CHECK-LABEL: test_mask_pcmpgt_q_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x37,0xc1] +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x37,0xc1][?:0.000000e+00] ; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e] ; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e] ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask) ret i8 %res } @@ -1212,13 +1170,11 @@ define <2 x double>@test_int_x86_avx512_mask_unpckh_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xd9] -; CHECK-NEXT: ## xmm3 = xmm0[1],xmm1[1] +; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xd9]xmm3 = xmm0[1],xmm1[1][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x15,0xd1] -; CHECK-NEXT: ## xmm2 {%k1} = xmm0[1],xmm1[1] -; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x15,0xd1]xmm2 {%k1} = xmm0[1],xmm1[1] +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -1230,13 +1186,11 @@ define <4 x double>@test_int_x86_avx512_mask_unpckh_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x15,0xd9] -; CHECK-NEXT: ## ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x15,0xd9]ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x15,0xd1] -; CHECK-NEXT: ## ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x15,0xd1]ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-NEXT: vaddpd 
%ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -1248,13 +1202,11 @@ define <4 x float>@test_int_x86_avx512_mask_unpckh_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xd9] -; CHECK-NEXT: ## xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xd9]xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x15,0xd1] -; CHECK-NEXT: ## xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x15,0xd1]xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -1266,13 +1218,11 @@ define <8 x float>@test_int_x86_avx512_mask_unpckh_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x15,0xd9] -; CHECK-NEXT: ## ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x15,0xd9]ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x15,0xd1] -; CHECK-NEXT: ## ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x15,0xd1]ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -1284,13 +1234,11 @@ define <2 x 
double>@test_int_x86_avx512_mask_unpckl_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vunpcklpd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x14,0xd9] -; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0] +; CHECK-NEXT: vunpcklpd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x14,0xd9]xmm3 = xmm0[0],xmm1[0][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vunpcklpd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x14,0xd1] -; CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[0] -; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vunpcklpd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x14,0xd1]xmm2 {%k1} = xmm0[0],xmm1[0] +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -1302,13 +1250,11 @@ define <4 x double>@test_int_x86_avx512_mask_unpckl_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x14,0xd9] -; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x14,0xd9]ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x14,0xd1] -; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x14,0xd1]ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -1320,13 +1266,11 @@ define <4 x float>@test_int_x86_avx512_mask_unpckl_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xd9] -; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xd9]xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x14,0xd1] -; 
CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x14,0xd1]xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -1338,13 +1282,11 @@ define <8 x float>@test_int_x86_avx512_mask_unpckl_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x14,0xd9] -; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x14,0xd9]ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x14,0xd1] -; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x14,0xd1]ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -1356,13 +1298,11 @@ define <4 x i32>@test_int_x86_avx512_mask_punpckhd_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6a,0xd9] -; CHECK-NEXT: ## xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6a,0xd9]xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6a,0xd1] -; CHECK-NEXT: ## xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6a,0xd1]xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] 
%res = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -1374,13 +1314,11 @@ define <4 x i32>@test_int_x86_avx512_mask_punpckld_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpckldq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x62,0xd9] -; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: vpunpckldq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x62,0xd9]xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpunpckldq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x62,0xd1] -; CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpunpckldq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x62,0xd1]xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -1392,13 +1330,11 @@ define <8 x i32>@test_int_x86_avx512_mask_punpckhd_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6a,0xd9] -; CHECK-NEXT: ## ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; CHECK-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6a,0xd9]ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6a,0xd1] -; CHECK-NEXT: ## ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6a,0xd1]ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -1410,13 +1346,11 @@ define <8 x i32>@test_int_x86_avx512_mask_punpckld_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpckldq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX 
Compression encoding: [0xc5,0xfd,0x62,0xd9] -; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; CHECK-NEXT: vpunpckldq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x62,0xd9]ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpunpckldq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x62,0xd1] -; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpunpckldq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x62,0xd1]ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -1428,13 +1362,11 @@ define <2 x i64>@test_int_x86_avx512_mask_punpckhqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6d,0xd9] -; CHECK-NEXT: ## xmm3 = xmm0[1],xmm1[1] +; CHECK-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6d,0xd9]xmm3 = xmm0[1],xmm1[1][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6d,0xd1] -; CHECK-NEXT: ## xmm2 = xmm0[1],xmm1[1] -; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6d,0xd1]xmm2 = xmm0[1],xmm1[1] +; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -1446,13 +1378,11 @@ define <2 x i64>@test_int_x86_avx512_mask_punpcklqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xd9] -; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0] +; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xd9]xmm3 = xmm0[0],xmm1[0][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6c,0xd1] -; CHECK-NEXT: ## xmm2 = xmm0[0],xmm1[0] -; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpunpcklqdq %xmm1, 
%xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6c,0xd1]xmm2 = xmm0[0],xmm1[0] +; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -1464,13 +1394,11 @@ define <4 x i64>@test_int_x86_avx512_mask_punpcklqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6c,0xd9] -; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6c,0xd9]ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6c,0xd1] -; CHECK-NEXT: ## ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6c,0xd1]ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -1482,13 +1410,11 @@ define <4 x i64>@test_int_x86_avx512_mask_punpckhqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6d,0xd9] -; CHECK-NEXT: ## ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6d,0xd9]ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6d,0xd1] -; CHECK-NEXT: ## ymm2 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6d,0xd1]ymm2 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -1498,8 +1424,8 @@ define <4 x i32> @test_mask_and_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_mask_and_epi32_rr_128: ; CHECK: ## BB#0: -; 
CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) ret <4 x i32> %res } @@ -1508,9 +1434,9 @@ ; CHECK-LABEL: test_mask_and_epi32_rrk_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpandd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdb,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdb,0xd1][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) ret <4 x i32> %res } @@ -1519,8 +1445,8 @@ ; CHECK-LABEL: test_mask_and_epi32_rrkz_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpandd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdb,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdb,0xc1][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask) ret <4 x i32> %res } @@ -1528,8 +1454,8 @@ define <4 x i32> @test_mask_and_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_and_epi32_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpand (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpand (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) ret <4 x i32> %res @@ -1539,9 +1465,9 @@ ; CHECK-LABEL: test_mask_and_epi32_rmk_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdb,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdb,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) ret <4 x i32> %res @@ -1551,8 +1477,8 @@ ; CHECK-LABEL: test_mask_and_epi32_rmkz_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdb,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandd 
(%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdb,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask) ret <4 x i32> %res @@ -1561,8 +1487,8 @@ define <4 x i32> @test_mask_and_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) { ; CHECK-LABEL: test_mask_and_epi32_rmb_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xdb,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xdb,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -1574,9 +1500,9 @@ ; CHECK-LABEL: test_mask_and_epi32_rmbk_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xdb,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xdb,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -1588,8 +1514,8 @@ ; CHECK-LABEL: test_mask_and_epi32_rmbkz_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xdb,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xdb,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -1602,8 +1528,8 @@ define <8 x i32> @test_mask_and_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: test_mask_and_epi32_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdb,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdb,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res } @@ -1612,9 +1538,9 @@ ; CHECK-LABEL: test_mask_and_epi32_rrk_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpandd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdb,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdb,0xd1][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) ret <8 x i32> %res } @@ -1623,8 +1549,8 @@ ; CHECK-LABEL: test_mask_and_epi32_rrkz_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpandd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0xc1][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask) ret <8 x i32> %res } @@ -1632,8 +1558,8 @@ define <8 x i32> @test_mask_and_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_and_epi32_rm_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpand (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdb,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpand (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdb,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res @@ -1643,9 +1569,9 @@ ; CHECK-LABEL: test_mask_and_epi32_rmk_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdb,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdb,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) ret <8 x i32> %res @@ -1655,8 +1581,8 @@ ; CHECK-LABEL: test_mask_and_epi32_rmkz_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask) ret <8 x i32> %res @@ -1665,8 +1591,8 @@ define <8 x i32> @test_mask_and_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) { ; CHECK-LABEL: test_mask_and_epi32_rmb_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xdb,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xdb,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -1678,9 +1604,9 @@ ; CHECK-LABEL: 
test_mask_and_epi32_rmbk_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xdb,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xdb,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -1692,8 +1618,8 @@ ; CHECK-LABEL: test_mask_and_epi32_rmbkz_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xdb,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xdb,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -1706,8 +1632,8 @@ define <4 x i32> @test_mask_or_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_mask_or_epi32_rr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xeb,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xeb,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) ret <4 x i32> %res } @@ -1716,9 +1642,9 @@ ; CHECK-LABEL: test_mask_or_epi32_rrk_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpord %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xeb,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpord %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xeb,0xd1][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) ret <4 x i32> %res } @@ -1727,8 +1653,8 @@ ; CHECK-LABEL: test_mask_or_epi32_rrkz_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpord %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xeb,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpord %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xeb,0xc1][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask) ret <4 x i32> %res } @@ -1736,8 +1662,8 @@ define <4 x i32> @test_mask_or_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_or_epi32_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpor (%rdi), %xmm0, 
%xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xeb,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpor (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xeb,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) ret <4 x i32> %res @@ -1747,9 +1673,9 @@ ; CHECK-LABEL: test_mask_or_epi32_rmk_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpord (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xeb,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpord (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xeb,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) ret <4 x i32> %res @@ -1759,8 +1685,8 @@ ; CHECK-LABEL: test_mask_or_epi32_rmkz_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpord (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xeb,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpord (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xeb,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask) ret <4 x i32> %res @@ -1769,8 +1695,8 @@ define <4 x i32> @test_mask_or_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) { ; CHECK-LABEL: test_mask_or_epi32_rmb_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xeb,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xeb,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -1782,9 +1708,9 @@ ; CHECK-LABEL: test_mask_or_epi32_rmbk_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xeb,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xeb,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -1796,8 +1722,8 @@ ; CHECK-LABEL: test_mask_or_epi32_rmbkz_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xeb,0x07] 
-; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xeb,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -1810,8 +1736,8 @@ define <8 x i32> @test_mask_or_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: test_mask_or_epi32_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xeb,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xeb,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res } @@ -1820,9 +1746,9 @@ ; CHECK-LABEL: test_mask_or_epi32_rrk_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpord %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xeb,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpord %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xeb,0xd1][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) ret <8 x i32> %res } @@ -1831,8 +1757,8 @@ ; CHECK-LABEL: test_mask_or_epi32_rrkz_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpord %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpord %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0xc1][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask) ret <8 x i32> %res } @@ -1840,8 +1766,8 @@ define <8 x i32> @test_mask_or_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_or_epi32_rm_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpor (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xeb,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpor (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xeb,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res @@ -1851,9 +1777,9 @@ ; CHECK-LABEL: test_mask_or_epi32_rmk_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpord (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xeb,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpord (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xeb,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq 
## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) ret <8 x i32> %res @@ -1863,8 +1789,8 @@ ; CHECK-LABEL: test_mask_or_epi32_rmkz_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpord (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpord (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask) ret <8 x i32> %res @@ -1873,8 +1799,8 @@ define <8 x i32> @test_mask_or_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) { ; CHECK-LABEL: test_mask_or_epi32_rmb_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xeb,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xeb,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -1886,9 +1812,9 @@ ; CHECK-LABEL: test_mask_or_epi32_rmbk_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xeb,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xeb,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -1900,8 +1826,8 @@ ; CHECK-LABEL: test_mask_or_epi32_rmbkz_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xeb,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xeb,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -1914,8 +1840,8 @@ define <4 x i32> @test_mask_xor_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_mask_xor_epi32_rr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) ret <4 x i32> %res } @@ -1924,9 +1850,9 @@ ; CHECK-LABEL: test_mask_xor_epi32_rrk_128: ; CHECK: 
## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpxord %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xef,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xef,0xd1][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) ret <4 x i32> %res } @@ -1935,8 +1861,8 @@ ; CHECK-LABEL: test_mask_xor_epi32_rrkz_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpxord %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xef,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xef,0xc1][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask) ret <4 x i32> %res } @@ -1944,8 +1870,8 @@ define <4 x i32> @test_mask_xor_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_xor_epi32_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxor (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxor (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) ret <4 x i32> %res @@ -1955,9 +1881,9 @@ ; CHECK-LABEL: test_mask_xor_epi32_rmk_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpxord (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xef,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xef,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) ret <4 x i32> %res @@ -1967,8 +1893,8 @@ ; CHECK-LABEL: test_mask_xor_epi32_rmkz_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpxord (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xef,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xef,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask) ret <4 x i32> %res @@ -1977,8 +1903,8 @@ define <4 x i32> @test_mask_xor_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) { ; CHECK-LABEL: test_mask_xor_epi32_rmb_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord 
(%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xef,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xef,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -1990,9 +1916,9 @@ ; CHECK-LABEL: test_mask_xor_epi32_rmbk_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpxord (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xef,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xef,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2004,8 +1930,8 @@ ; CHECK-LABEL: test_mask_xor_epi32_rmbkz_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpxord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xef,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xef,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2018,8 +1944,8 @@ define <8 x i32> @test_mask_xor_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: test_mask_xor_epi32_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxor %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xef,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxor %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xef,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res } @@ -2028,9 +1954,9 @@ ; CHECK-LABEL: test_mask_xor_epi32_rrk_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpxord %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xef,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xef,0xd1][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) ret <8 x i32> %res } @@ -2039,8 +1965,8 @@ ; CHECK-LABEL: test_mask_xor_epi32_rrkz_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpxord %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xef,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; 
CHECK-NEXT: vpxord %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xef,0xc1][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask) ret <8 x i32> %res } @@ -2048,8 +1974,8 @@ define <8 x i32> @test_mask_xor_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_xor_epi32_rm_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxor (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xef,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxor (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xef,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res @@ -2059,9 +1985,9 @@ ; CHECK-LABEL: test_mask_xor_epi32_rmk_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpxord (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xef,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xef,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) ret <8 x i32> %res @@ -2071,8 +1997,8 @@ ; CHECK-LABEL: test_mask_xor_epi32_rmkz_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpxord (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xef,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xef,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask) ret <8 x i32> %res @@ -2081,8 +2007,8 @@ define <8 x i32> @test_mask_xor_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) { ; CHECK-LABEL: test_mask_xor_epi32_rmb_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xef,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xef,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2094,9 +2020,9 @@ ; CHECK-LABEL: test_mask_xor_epi32_rmbk_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xef,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xef,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX 
Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2108,8 +2034,8 @@ ; CHECK-LABEL: test_mask_xor_epi32_rmbkz_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xef,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xef,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2122,8 +2048,8 @@ define <4 x i32> @test_mask_andnot_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_mask_andnot_epi32_rr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) ret <4 x i32> %res } @@ -2132,9 +2058,9 @@ ; CHECK-LABEL: test_mask_andnot_epi32_rrk_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpandnd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdf,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdf,0xd1][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) ret <4 x i32> %res } @@ -2143,8 +2069,8 @@ ; CHECK-LABEL: test_mask_andnot_epi32_rrkz_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpandnd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdf,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdf,0xc1][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask) ret <4 x i32> %res } @@ -2152,8 +2078,8 @@ define <4 x i32> @test_mask_andnot_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_andnot_epi32_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandn (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandn (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) ret <4 x i32> %res @@ -2163,9 +2089,9 @@ ; 
CHECK-LABEL: test_mask_andnot_epi32_rmk_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandnd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdf,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdf,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) ret <4 x i32> %res @@ -2175,8 +2101,8 @@ ; CHECK-LABEL: test_mask_andnot_epi32_rmkz_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandnd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdf,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdf,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask) ret <4 x i32> %res @@ -2185,8 +2111,8 @@ define <4 x i32> @test_mask_andnot_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) { ; CHECK-LABEL: test_mask_andnot_epi32_rmb_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xdf,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xdf,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2198,9 +2124,9 @@ ; CHECK-LABEL: test_mask_andnot_epi32_rmbk_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xdf,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xdf,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2212,8 +2138,8 @@ ; CHECK-LABEL: test_mask_andnot_epi32_rmbkz_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xdf,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xdf,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2226,8 +2152,8 @@ define <8 x 
i32> @test_mask_andnot_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: test_mask_andnot_epi32_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res } @@ -2236,9 +2162,9 @@ ; CHECK-LABEL: test_mask_andnot_epi32_rrk_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpandnd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdf,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdf,0xd1][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) ret <8 x i32> %res } @@ -2247,8 +2173,8 @@ ; CHECK-LABEL: test_mask_andnot_epi32_rrkz_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpandnd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0xc1][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask) ret <8 x i32> %res } @@ -2256,8 +2182,8 @@ define <8 x i32> @test_mask_andnot_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_andnot_epi32_rm_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandn (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandn (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res @@ -2267,9 +2193,9 @@ ; CHECK-LABEL: test_mask_andnot_epi32_rmk_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandnd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdf,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdf,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) ret <8 x i32> %res @@ -2279,8 +2205,8 @@ ; CHECK-LABEL: test_mask_andnot_epi32_rmkz_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] 
-; CHECK-NEXT: vpandnd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask) ret <8 x i32> %res @@ -2289,8 +2215,8 @@ define <8 x i32> @test_mask_andnot_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) { ; CHECK-LABEL: test_mask_andnot_epi32_rmb_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xdf,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xdf,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2302,9 +2228,9 @@ ; CHECK-LABEL: test_mask_andnot_epi32_rmbk_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xdf,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xdf,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2316,8 +2242,8 @@ ; CHECK-LABEL: test_mask_andnot_epi32_rmbkz_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xdf,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xdf,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2330,8 +2256,8 @@ define <2 x i64> @test_mask_andnot_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_mask_andnot_epi64_rr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1) ret <2 x i64> %res } @@ -2340,9 +2266,9 @@ ; CHECK-LABEL: test_mask_andnot_epi64_rrk_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpandnq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xdf,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] 
+; CHECK-NEXT: vpandnq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xdf,0xd1][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask) ret <2 x i64> %res } @@ -2351,8 +2277,8 @@ ; CHECK-LABEL: test_mask_andnot_epi64_rrkz_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpandnq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xdf,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xdf,0xc1][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask) ret <2 x i64> %res } @@ -2360,8 +2286,8 @@ define <2 x i64> @test_mask_andnot_epi64_rm_128(<2 x i64> %a, <2 x i64>* %ptr_b) { ; CHECK-LABEL: test_mask_andnot_epi64_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandn (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandn (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <2 x i64>, <2 x i64>* %ptr_b %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1) ret <2 x i64> %res @@ -2371,9 +2297,9 @@ ; CHECK-LABEL: test_mask_andnot_epi64_rmk_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandnq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xdf,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xdf,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <2 x i64>, <2 x i64>* %ptr_b %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask) ret <2 x i64> %res @@ -2383,8 +2309,8 @@ ; CHECK-LABEL: test_mask_andnot_epi64_rmkz_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandnq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xdf,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xdf,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <2 x i64>, <2 x i64>* %ptr_b %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask) ret <2 x i64> %res @@ -2393,8 +2319,8 @@ define <2 x i64> @test_mask_andnot_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) { ; CHECK-LABEL: test_mask_andnot_epi64_rmb_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandnq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x18,0xdf,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x18,0xdf,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = 
load i64, i64* %ptr_b %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0 %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer @@ -2406,9 +2332,9 @@ ; CHECK-LABEL: test_mask_andnot_epi64_rmbk_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandnq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x19,0xdf,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x19,0xdf,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0 %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer @@ -2420,8 +2346,8 @@ ; CHECK-LABEL: test_mask_andnot_epi64_rmbkz_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandnq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0xdf,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0xdf,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0 %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer @@ -2434,8 +2360,8 @@ define <4 x i64> @test_mask_andnot_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) { ; CHECK-LABEL: test_mask_andnot_epi64_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1) ret <4 x i64> %res } @@ -2444,9 +2370,9 @@ ; CHECK-LABEL: test_mask_andnot_epi64_rrk_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpandnq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xdf,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xdf,0xd1][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask) ret <4 x i64> %res } @@ -2455,8 +2381,8 @@ ; CHECK-LABEL: test_mask_andnot_epi64_rrkz_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpandnq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0xc1][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> 
zeroinitializer, i8 %mask) ret <4 x i64> %res } @@ -2464,8 +2390,8 @@ define <4 x i64> @test_mask_andnot_epi64_rm_256(<4 x i64> %a, <4 x i64>* %ptr_b) { ; CHECK-LABEL: test_mask_andnot_epi64_rm_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandn (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandn (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i64>, <4 x i64>* %ptr_b %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1) ret <4 x i64> %res @@ -2475,9 +2401,9 @@ ; CHECK-LABEL: test_mask_andnot_epi64_rmk_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandnq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xdf,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xdf,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i64>, <4 x i64>* %ptr_b %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask) ret <4 x i64> %res @@ -2487,8 +2413,8 @@ ; CHECK-LABEL: test_mask_andnot_epi64_rmkz_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandnq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i64>, <4 x i64>* %ptr_b %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask) ret <4 x i64> %res @@ -2497,8 +2423,8 @@ define <4 x i64> @test_mask_andnot_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) { ; CHECK-LABEL: test_mask_andnot_epi64_rmb_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandnq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x38,0xdf,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x38,0xdf,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0 %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer @@ -2510,9 +2436,9 @@ ; CHECK-LABEL: test_mask_andnot_epi64_rmbk_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandnq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x39,0xdf,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x39,0xdf,0x0f][?:0.000000e+00] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0 %b = shufflevector <4 x i64> %vecinit.i, 
<4 x i64> undef, <4 x i32> zeroinitializer @@ -2524,8 +2450,8 @@ ; CHECK-LABEL: test_mask_andnot_epi64_rmbkz_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpandnq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0xdf,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpandnq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0xdf,0x07][?:0.000000e+00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0 %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer @@ -2538,8 +2464,8 @@ define <4 x i32> @test_mask_add_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_mask_add_epi32_rr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) ret <4 x i32> %res } @@ -2549,8 +2475,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) ret <4 x i32> %res } @@ -2560,7 +2486,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask) ret <4 x i32> %res } @@ -2568,8 +2494,8 @@ define <4 x i32> @test_mask_add_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_add_epi32_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) ret <4 x i32> %res @@ -2580,8 +2506,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 
x i32> %b, <4 x i32> %passThru, i8 %mask) ret <4 x i32> %res @@ -2592,7 +2518,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask) ret <4 x i32> %res @@ -2602,7 +2528,7 @@ ; CHECK-LABEL: test_mask_add_epi32_rmb_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpaddd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xfe,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2615,8 +2541,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpaddd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfe,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2629,7 +2555,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpaddd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfe,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2642,8 +2568,8 @@ define <4 x i32> @test_mask_sub_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_mask_sub_epi32_rr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfa,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfa,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) ret <4 x i32> %res } @@ -2653,8 +2579,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfa,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) ret <4 x i32> %res } @@ -2664,7 +2590,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfa,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: 
retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask) ret <4 x i32> %res } @@ -2672,8 +2598,8 @@ define <4 x i32> @test_mask_sub_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_sub_epi32_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsubd (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfa,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpsubd (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfa,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) ret <4 x i32> %res @@ -2684,8 +2610,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsubd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfa,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) ret <4 x i32> %res @@ -2696,7 +2622,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsubd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfa,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask) ret <4 x i32> %res @@ -2706,7 +2632,7 @@ ; CHECK-LABEL: test_mask_sub_epi32_rmb_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpsubd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xfa,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2719,8 +2645,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsubd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfa,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2733,7 +2659,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsubd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfa,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2746,8 +2672,8 @@ define <8 x i32> 
@test_mask_sub_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: test_mask_sub_epi32_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfa,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfa,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res } @@ -2757,8 +2683,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfa,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) ret <8 x i32> %res } @@ -2768,7 +2694,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfa,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask) ret <8 x i32> %res } @@ -2776,8 +2702,8 @@ define <8 x i32> @test_mask_sub_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_sub_epi32_rm_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsubd (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfa,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpsubd (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfa,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res @@ -2788,8 +2714,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsubd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfa,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) ret <8 x i32> %res @@ -2800,7 +2726,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsubd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfa,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask) ret <8 x i32> %res @@ -2810,7 +2736,7 @@ ; CHECK-LABEL: test_mask_sub_epi32_rmb_256: ; CHECK: ## BB#0: ; CHECK-NEXT: vpsubd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xfa,0x07] -; 
CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2823,8 +2749,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsubd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfa,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2837,7 +2763,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsubd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfa,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2850,8 +2776,8 @@ define <8 x i32> @test_mask_add_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: test_mask_add_epi32_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res } @@ -2861,8 +2787,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) ret <8 x i32> %res } @@ -2872,7 +2798,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask) ret <8 x i32> %res } @@ -2880,8 +2806,8 @@ define <8 x i32> @test_mask_add_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_add_epi32_rm_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0x07][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> 
@llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res @@ -2892,8 +2818,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) ret <8 x i32> %res @@ -2904,7 +2830,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask) ret <8 x i32> %res @@ -2914,7 +2840,7 @@ ; CHECK-LABEL: test_mask_add_epi32_rmb_256: ; CHECK: ## BB#0: ; CHECK-NEXT: vpaddd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xfe,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2927,8 +2853,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpaddd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfe,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2941,7 +2867,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpaddd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfe,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i32, i32* %ptr_b %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2956,7 +2882,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask) ret <8 x float> %res } @@ -2966,8 +2892,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x58,0xd1] -; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] -; CHECK-NEXT: retq ## 
encoding: [0xc3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) ret <8 x float> %res } @@ -2975,8 +2901,8 @@ define <8 x float> @test_mm512_add_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_add_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1) ret <8 x float> %res } @@ -2987,7 +2913,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask) ret <4 x float> %res } @@ -2997,8 +2923,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x58,0xd1] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) ret <4 x float> %res } @@ -3006,8 +2932,8 @@ define <4 x float> @test_mm512_add_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_add_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1) ret <4 x float> %res } @@ -3018,7 +2944,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5c,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask) ret <8 x float> %res } @@ -3028,8 +2954,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5c,0xd1] -; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> 
@llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) ret <8 x float> %res } @@ -3037,8 +2963,8 @@ define <8 x float> @test_mm512_sub_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_sub_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5c,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5c,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1) ret <8 x float> %res } @@ -3049,7 +2975,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5c,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask) ret <4 x float> %res } @@ -3059,8 +2985,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5c,0xd1] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) ret <4 x float> %res } @@ -3068,8 +2994,8 @@ define <4 x float> @test_mm512_sub_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_sub_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5c,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5c,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1) ret <4 x float> %res } @@ -3080,7 +3006,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x59,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask) ret <8 x float> %res } @@ -3090,8 +3016,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x59,0xd1] -; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) ret <8 x float> %res } @@ -3099,8 +3025,8 @@ define <8 x float> @test_mm512_mul_ps_256(<8 x float> 
%a0, <8 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_mul_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x59,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x59,0xc1][5:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1) ret <8 x float> %res } @@ -3111,7 +3037,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmulps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x59,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask) ret <4 x float> %res } @@ -3121,8 +3047,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmulps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x59,0xd1] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) ret <4 x float> %res } @@ -3130,8 +3056,8 @@ define <4 x float> @test_mm512_mul_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_mul_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vmulps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x59,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmulps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x59,0xc1][5:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1) ret <4 x float> %res } @@ -3142,7 +3068,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5e,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask) ret <8 x float> %res } @@ -3152,8 +3078,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5e,0xd1] -; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) ret <8 x float> %res } @@ -3161,8 +3087,8 @@ define <8 x float> @test_mm512_div_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_div_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5e,0xc1] -; 
CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5e,0xc1][19:2.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1) ret <8 x float> %res } @@ -3173,7 +3099,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vdivps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5e,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask) ret <4 x float> %res } @@ -3183,8 +3109,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vdivps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5e,0xd1] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) ret <4 x float> %res } @@ -3192,8 +3118,8 @@ define <4 x float> @test_mm512_div_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_div_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vdivps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5e,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vdivps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5e,0xc1][12:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1) ret <4 x float> %res } @@ -3204,16 +3130,13 @@ define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xd9,0x01] -; CHECK-NEXT: ## xmm3 = xmm0[1],xmm1[0] +; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xd9,0x01]xmm3 = xmm0[1],xmm1[0][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01] -; CHECK-NEXT: ## xmm2 {%k1} = xmm0[1],xmm1[0] -; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01] -; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1],xmm1[0] -; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xcb] -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01]xmm2 {%k1} = xmm0[1],xmm1[0] +; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01]xmm0 {%k1} {z} = xmm0[1],xmm1[0] +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xcb][3:1.00] +; CHECK-NEXT: vaddpd 
%xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4) %res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1) %res2 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> zeroinitializer, i8 %x4) @@ -3227,13 +3150,11 @@ define <4 x double>@test_int_x86_avx512_mask_shuf_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vshufpd $6, %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xc6,0xd9,0x06] -; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[1],ymm0[3],ymm1[2] +; CHECK-NEXT: vshufpd $6, %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xc6,0xd9,0x06]ymm3 = ymm0[0],ymm1[1],ymm0[3],ymm1[2][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vshufpd $6, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xc6,0xd1,0x06] -; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[1],ymm0[3],ymm1[2] -; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vshufpd $6, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xc6,0xd1,0x06]ymm2 {%k1} = ymm0[0],ymm1[1],ymm0[3],ymm1[2] +; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double> %x0, <4 x double> %x1, i32 6, <4 x double> %x3, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double> %x0, <4 x double> %x1, i32 6, <4 x double> %x3, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -3245,13 +3166,11 @@ define <4 x float>@test_int_x86_avx512_mask_shuf_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xd9,0x16] -; CHECK-NEXT: ## xmm3 = xmm0[2,1],xmm1[1,0] +; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xd9,0x16]xmm3 = xmm0[2,1],xmm1[1,0][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0xc6,0xd1,0x16] -; CHECK-NEXT: ## xmm2 {%k1} = xmm0[2,1],xmm1[1,0] -; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0xc6,0xd1,0x16]xmm2 {%k1} = xmm0[2,1],xmm1[1,0] +; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float> %x0, <4 x float> %x1, i32 22, <4 x float> %x3, i8 %x4) %res1 = call <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float> %x0, <4 x float> %x1, i32 22, <4 x float> %x3, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -3263,13 +3182,11 @@ define <8 x 
float>@test_int_x86_avx512_mask_shuf_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0xc6,0xd9,0x16] -; CHECK-NEXT: ## ymm3 = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4] +; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0xc6,0xd9,0x16]ymm3 = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0xc6,0xd1,0x16] -; CHECK-NEXT: ## ymm2 {%k1} = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4] -; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0xc6,0xd1,0x16]ymm2 {%k1} = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4] +; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4) %res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -3284,8 +3201,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3d,0xd1] ; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3d,0xc1] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2 ,i8 %mask) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) %res2 = add <4 x i32> %res, %res1 @@ -3297,11 +3214,11 @@ define <8 x i32>@test_int_x86_avx512_mask_pmaxs_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xd9] +; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xd9][1:0.50] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3d,0xd1] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -3316,8 +3233,8 @@ ; CHECK-NEXT: vpmaxsq %xmm1, %xmm0, %xmm3 ## encoding: 
[0x62,0xf2,0xfd,0x08,0x3d,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmaxsq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x3d,0xd1] -; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -3332,8 +3249,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmaxsq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x3d,0xd1] ; CHECK-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x3d,0xc1] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) %res2 = add <4 x i64> %res, %res1 @@ -3348,8 +3265,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3f,0xd1] ; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3f,0xc1] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) %res2 = add <4 x i32> %res, %res1 @@ -3361,11 +3278,11 @@ define <8 x i32>@test_int_x86_avx512_mask_pmaxu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xd9] +; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xd9][1:0.50] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3f,0xd1] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) %res2 = add <8 x i32> 
%res, %res1 @@ -3380,8 +3297,8 @@ ; CHECK-NEXT: vpmaxuq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x3f,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmaxuq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x3f,0xd1] -; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -3396,8 +3313,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmaxuq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x3f,0xd1] ; CHECK-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x3f,0xc1] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) %res2 = add <4 x i64> %res, %res1 @@ -3412,8 +3329,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x39,0xd1] ; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x39,0xc1] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) %res2 = add <4 x i32> %res, %res1 @@ -3425,11 +3342,11 @@ define <8 x i32>@test_int_x86_avx512_mask_pmins_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmins_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xd9] +; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xd9][1:0.50] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x39,0xd1] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> 
@llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -3444,8 +3361,8 @@ ; CHECK-NEXT: vpminsq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x39,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpminsq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x39,0xd1] -; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -3460,8 +3377,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpminsq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x39,0xd1] ; CHECK-NEXT: vpminsq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x39,0xc1] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) %res2 = add <4 x i64> %res, %res1 @@ -3476,8 +3393,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3b,0xd1] ; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3b,0xc1] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) %res2 = add <4 x i32> %res, %res1 @@ -3489,11 +3406,11 @@ define <8 x i32>@test_int_x86_avx512_mask_pminu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_pminu_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xd9] +; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xd9][1:0.50] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3b,0xd1] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> 
@llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -3508,8 +3425,8 @@ ; CHECK-NEXT: vpminuq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x3b,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpminuq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x3b,0xd1] -; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -3524,8 +3441,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpminuq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x3b,0xd1] ; CHECK-NEXT: vpminuq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x3b,0xc1] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) %res2 = add <4 x i64> %res, %res1 @@ -3537,13 +3454,13 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xd9] +; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xd9][2:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xd3,0xd1] ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xd3,0xc1] -; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xcb] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xcb][1:0.50] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3) @@ -3557,13 +3474,13 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: 
test_int_x86_avx512_mask_psrl_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xd9] +; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xd9][2:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xd3,0xd1] ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xd3,0xc1] -; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb][1:0.50] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1) %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) @@ -3577,13 +3494,13 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xd9] +; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xd9][2:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd2,0xd1] ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd2,0xc1] -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb][1:0.50] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3) @@ -3597,13 +3514,13 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xd9] +; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xd9][2:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd2,0xd1] ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd2,0xc1] -; CHECK-NEXT: 
vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb] -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb][1:0.50] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1) %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) @@ -3617,13 +3534,13 @@ define <4 x i32>@test_int_x86_avx512_mask_psra_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe2,0xd9] +; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe2,0xd9][2:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe2,0xd1] ; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe2,0xc1] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] -; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3) %res2 = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) @@ -3637,13 +3554,13 @@ define <8 x i32>@test_int_x86_avx512_mask_psra_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe2,0xd9] +; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe2,0xd9][2:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe2,0xd1] ; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe2,0xc1] -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> 
@llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) %res2 = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -3657,13 +3574,13 @@ define <4 x i32>@test_int_x86_avx512_mask_psll_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf2,0xd9] +; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf2,0xd9][2:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf2,0xd1] ; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf2,0xc1] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] -; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3) %res2 = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) @@ -3677,13 +3594,13 @@ define <8 x i32>@test_int_x86_avx512_mask_psll_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf2,0xd9] +; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf2,0xd9][2:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf2,0xd1] ; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf2,0xc1] -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) %res2 = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -3697,13 +3614,13 @@ define <4 x i64>@test_int_x86_avx512_mask_psll_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: 
vpsllq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf3,0xd9] +; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf3,0xd9][2:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf3,0xd1] ; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf3,0xc1] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) %res2 = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -3717,13 +3634,13 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i32 %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x73,0xd0,0x03] +; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x73,0xd0,0x03][1:1.00] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xd0,0x03] ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03] -; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] -; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca][1:0.50] +; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 -1) %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 3, <2 x i64> zeroinitializer, i8 %x3) @@ -3737,13 +3654,13 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x73,0xd0,0x03] +; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x73,0xd0,0x03][1:1.00] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xd0,0x03] ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03] -; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf5,0xd4,0xca] -; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca][1:0.50] +; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1) %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3) @@ -3757,13 +3674,13 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xd0,0x03] +; CHECK-NEXT: vpsrld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xd0,0x03][1:1.00] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xd0,0x03] ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03] -; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca][1:0.50] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1) %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3) @@ -3777,13 +3694,13 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xd0,0x03] +; CHECK-NEXT: vpsrld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xd0,0x03][1:1.00] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xd0,0x03] ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03] -; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca][1:0.50] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> 
@llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1) %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3) @@ -3797,13 +3714,13 @@ define <4 x i32>@test_int_x86_avx512_mask_psll_di_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpslld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xf0,0x03] +; CHECK-NEXT: vpslld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xf0,0x03][1:1.00] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpslld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xf0,0x03] ; CHECK-NEXT: vpslld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xf0,0x03] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3) %res2 = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1) @@ -3817,13 +3734,13 @@ define <8 x i32>@test_int_x86_avx512_mask_psll_di_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpslld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xf0,0x03] +; CHECK-NEXT: vpslld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xf0,0x03][1:1.00] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpslld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xf0,0x03] ; CHECK-NEXT: vpslld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xf0,0x03] -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3) %res2 = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1) @@ -3837,13 +3754,13 @@ define <2 x i64>@test_int_x86_avx512_mask_psrlv2_di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrlv2_di: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0xd9] +; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression 
encoding: [0xc4,0xe2,0xf9,0x45,0xd9][2:2.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x45,0xd1] ; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x45,0xc1] -; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] -; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3) %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -3857,13 +3774,13 @@ define <4 x i64>@test_int_x86_avx512_mask_psrlv4_di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_di: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0xd9] +; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0xd9][2:2.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x45,0xd1] ; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x45,0xc1] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -3877,13 +3794,13 @@ define <4 x i32>@test_int_x86_avx512_mask_psrlv4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_si: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0xd9] +; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0xd9][2:2.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x45,0xd1] ; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x45,0xc1] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] -; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] -; CHECK-NEXT: 
retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3) %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) @@ -3897,13 +3814,13 @@ define <8 x i32>@test_int_x86_avx512_mask_psrlv8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_si: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0xd9] +; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0xd9][2:2.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x45,0xd1] ; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x45,0xc1] -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -3917,13 +3834,13 @@ define <4 x i32>@test_int_x86_avx512_mask_psrav4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrav4_si: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0xd9] +; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0xd9][2:2.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x46,0xd1] ; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x46,0xc1] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] -; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 
%x3) %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) @@ -3937,13 +3854,13 @@ define <8 x i32>@test_int_x86_avx512_mask_psrav8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_si: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0xd9] +; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0xd9][2:2.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x46,0xd1] ; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x46,0xc1] -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -3955,12 +3872,13 @@ define <8 x i32>@test_int_x86_avx512_mask_psrav8_si_const() { ; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_si_const: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] -; CHECK-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; CHECK-NEXT: vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; CHECK-NEXT: ## fixup A - offset: 4, value: LCPI276_0-4, kind: reloc_riprel_4byte +; CHECK-NEXT: ## [4:0.50] ; CHECK-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] ; CHECK-NEXT: ## fixup A - offset: 5, value: LCPI276_1-4, kind: reloc_riprel_4byte -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: ## [6:2.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> , <8 x i32> , <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res } @@ -3970,13 +3888,13 @@ define <2 x i64>@test_int_x86_avx512_mask_psllv2_di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psllv2_di: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0xd9] +; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0xd9][2:2.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x47,0xd1] ; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x47,0xc1] -; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] -; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression 
encoding: [0xc5,0xf9,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.psllv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.psllv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3) %res2 = call <2 x i64> @llvm.x86.avx512.mask.psllv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -3990,13 +3908,13 @@ define <4 x i64>@test_int_x86_avx512_mask_psllv4_di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psllv4_di: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0xd9] +; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0xd9][2:2.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x47,0xd1] ; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x47,0xc1] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.psllv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.psllv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) %res2 = call <4 x i64> @llvm.x86.avx512.mask.psllv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -4010,13 +3928,13 @@ define <4 x i32>@test_int_x86_avx512_mask_psllv4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psllv4_si: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0xd9] +; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0xd9][2:2.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x47,0xd1] ; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x47,0xc1] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] -; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.psllv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.psllv4.si(<4 x i32> %x0, 
<4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3) %res2 = call <4 x i32> @llvm.x86.avx512.mask.psllv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) @@ -4030,13 +3948,13 @@ define <8 x i32>@test_int_x86_avx512_mask_psllv8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psllv8_si: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0xd9] +; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0xd9][2:2.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x47,0xd1] ; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x47,0xc1] -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.psllv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.psllv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) %res2 = call <8 x i32> @llvm.x86.avx512.mask.psllv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -4050,16 +3968,13 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovzxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxbd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x31,0xd0] -; CHECK-NEXT: ## xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: vpmovzxbd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x31,0xd0]xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero[1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpmovzxbd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x31,0xc8] -; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; CHECK-NEXT: vpmovzxbd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x31,0xc0] -; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmovzxbd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x31,0xc8]xmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: vpmovzxbd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x31,0xc0]xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: 
vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2) %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1) @@ -4073,16 +3988,13 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovzxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxbd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x31,0xd0] -; CHECK-NEXT: ## ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x31,0xd0]ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero[1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpmovzxbd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x31,0xc8] -; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; CHECK-NEXT: vpmovzxbd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x31,0xc0] -; CHECK-NEXT: ## ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmovzxbd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x31,0xc8]ymm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x31,0xc0]ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2) %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1) @@ -4096,16 +4008,13 @@ define <2 x i64>@test_int_x86_avx512_mask_pmovzxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxbq %xmm0, %xmm2 ## EVEX 
TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xd0] -; CHECK-NEXT: ## xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpmovzxbq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xd0]xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero[1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x32,0xc8] -; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpmovzxbq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x32,0xc0] -; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x32,0xc8]xmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpmovzxbq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x32,0xc0]xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2) %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1) @@ -4119,16 +4028,13 @@ define <4 x i64>@test_int_x86_avx512_mask_pmovzxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxbq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x32,0xd0] -; CHECK-NEXT: ## ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpmovzxbq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x32,0xd0]ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero[3:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpmovzxbq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x32,0xc8] -; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpmovzxbq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x32,0xc0] -; CHECK-NEXT: ## ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression 
encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmovzxbq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x32,0xc8]ymm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpmovzxbq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x32,0xc0]ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2) %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1) @@ -4142,16 +4048,13 @@ define <2 x i64>@test_int_x86_avx512_mask_pmovzxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxdq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xd0] -; CHECK-NEXT: ## xmm2 = xmm0[0],zero,xmm0[1],zero +; CHECK-NEXT: vpmovzxdq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xd0]xmm2 = xmm0[0],zero,xmm0[1],zero[1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpmovzxdq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x35,0xc8] -; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero -; CHECK-NEXT: vpmovzxdq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x35,0xc0] -; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmovzxdq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x35,0xc8]xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero +; CHECK-NEXT: vpmovzxdq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x35,0xc0]xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2) %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1) @@ -4165,16 +4068,13 @@ define <4 x i64>@test_int_x86_avx512_mask_pmovzxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxdq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x35,0xd0] -; CHECK-NEXT: ## ymm2 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: vpmovzxdq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x35,0xd0]ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero[3:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpmovzxdq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x35,0xc8] -; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: vpmovzxdq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x35,0xc0] -; CHECK-NEXT: ## ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmovzxdq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x35,0xc8]ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: vpmovzxdq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x35,0xc0]ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2) %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1) @@ -4188,16 +4088,13 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovzxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxwd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xd0] -; CHECK-NEXT: ## xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: vpmovzxwd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xd0]xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero[1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpmovzxwd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x33,0xc8] -; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: vpmovzxwd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x33,0xc0] -; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmovzxwd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x33,0xc8]xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: vpmovzxwd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x33,0xc0]xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 
x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2) %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1) @@ -4211,16 +4108,13 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovzxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxwd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x33,0xd0] -; CHECK-NEXT: ## ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-NEXT: vpmovzxwd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x33,0xd0]ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero[1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpmovzxwd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x33,0xc8] -; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NEXT: vpmovzxwd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x33,0xc0] -; CHECK-NEXT: ## ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmovzxwd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x33,0xc8]ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-NEXT: vpmovzxwd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x33,0xc0]ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2) %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1) @@ -4234,16 +4128,13 @@ define <2 x i64>@test_int_x86_avx512_mask_pmovzxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxwq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x34,0xd0] -; CHECK-NEXT: ## xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; CHECK-NEXT: vpmovzxwq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x34,0xd0]xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero[1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpmovzxwq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x34,0xc8] -; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; CHECK-NEXT: vpmovzxwq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x34,0xc0] -; CHECK-NEXT: ## xmm0 
{%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmovzxwq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x34,0xc8]xmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; CHECK-NEXT: vpmovzxwq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x34,0xc0]xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2) %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1) @@ -4257,16 +4148,13 @@ define <4 x i64>@test_int_x86_avx512_mask_pmovzxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxwq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x34,0xd0] -; CHECK-NEXT: ## ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: vpmovzxwq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x34,0xd0]ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero[1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpmovzxwq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x34,0xc8] -; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; CHECK-NEXT: vpmovzxwq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x34,0xc0] -; CHECK-NEXT: ## ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmovzxwq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x34,0xc8]ymm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: vpmovzxwq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x34,0xc0]ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2) %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1) @@ -4280,13 +4168,13 @@ define <4 x 
i32>@test_int_x86_avx512_mask_pmovsxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0xd0] +; CHECK-NEXT: vpmovsxbd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0xd0][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmovsxbd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x21,0xc8] ; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x21,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2) %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1) @@ -4300,13 +4188,13 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovsxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0xd0] +; CHECK-NEXT: vpmovsxbd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0xd0][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmovsxbd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x21,0xc8] ; CHECK-NEXT: vpmovsxbd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x21,0xc0] -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2) %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1) @@ -4320,13 +4208,13 @@ define <2 x i64>@test_int_x86_avx512_mask_pmovsxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0xd0] +; CHECK-NEXT: vpmovsxbq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0xd0][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmovsxbq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x22,0xc8] ; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x22,0xc0] -; CHECK-NEXT: vpaddq 
%xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2) %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1) @@ -4340,13 +4228,13 @@ define <4 x i64>@test_int_x86_avx512_mask_pmovsxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0xd0] +; CHECK-NEXT: vpmovsxbq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0xd0][3:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmovsxbq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x22,0xc8] ; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x22,0xc0] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2) %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1) @@ -4360,13 +4248,13 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovsxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxwd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x23,0xd0] +; CHECK-NEXT: vpmovsxwd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x23,0xd0][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmovsxwd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x23,0xc8] ; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x23,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> 
@llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2) %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1) @@ -4380,13 +4268,13 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovsxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxwd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xd0] +; CHECK-NEXT: vpmovsxwd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xd0][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmovsxwd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x23,0xc8] ; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x23,0xc0] -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2) %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1) @@ -4400,13 +4288,13 @@ define <2 x i64>@test_int_x86_avx512_mask_pmovsxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxwq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x24,0xd0] +; CHECK-NEXT: vpmovsxwq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x24,0xd0][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x24,0xc8] ; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x24,0xc0] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2) %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1) @@ -4420,13 +4308,13 @@ define <4 x i64>@test_int_x86_avx512_mask_pmovsxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxwq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x24,0xd0] +; CHECK-NEXT: vpmovsxwq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x24,0xd0][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: 
[0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x24,0xc8] ; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x24,0xc0] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2) %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1) @@ -4444,9 +4332,9 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xe2,0xd1] ; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xe2,0xc1] -; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] -; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3) %res2 = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -4464,9 +4352,9 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xe2,0xd1] ; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xe2,0xc1] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) %res2 = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -4484,9 +4372,9 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsraq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xe0,0x03] ; CHECK-NEXT: vpsraq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x72,0xe0,0x03] -; CHECK-NEXT: vpaddq 
%xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i32 3, <2 x i64> zeroinitializer, i8 %x3) %res2 = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 -1) @@ -4504,9 +4392,9 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsraq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xe0,0x03] ; CHECK-NEXT: vpsraq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x72,0xe0,0x03] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3) %res2 = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1) @@ -4524,9 +4412,9 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x46,0xd1] ; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x46,0xc1] -; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] -; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3) %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -4538,12 +4426,12 @@ define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128_const(i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_128_const: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,18446744073709551607] -; CHECK-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; CHECK-NEXT: vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,18446744073709551607]encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] ; CHECK-NEXT: ## fixup A - offset: 4, value: 
LCPI304_0-4, kind: reloc_riprel_4byte +; CHECK-NEXT: ## [4:0.50] ; CHECK-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x46,0x05,A,A,A,A] ; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI304_1-4, kind: reloc_riprel_4byte -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> , <2 x i64> , <2 x i64> zeroinitializer, i8 -1) ret <2 x i64> %res } @@ -4557,9 +4445,9 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x46,0xd1] ; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x46,0xc1] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -4573,11 +4461,11 @@ define <2 x double>@test_int_x86_avx512_mask_cvt_dq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0xe6,0xd0] +; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0xe6,0xd0][4:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0xe6,0xc8] -; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -4589,11 +4477,11 @@ define <4 x double>@test_int_x86_avx512_mask_cvt_dq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0xe6,0xd0] +; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0xe6,0xd0][6:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0xe6,0xc8] -; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call 
<4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -4608,8 +4496,8 @@ ; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm2 ## encoding: [0x62,0xf1,0x7e,0x08,0x7a,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x7a,0xc8] -; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -4624,8 +4512,8 @@ ; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm2 ## encoding: [0x62,0xf1,0x7e,0x28,0x7a,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x7a,0xc8] -; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -4637,16 +4525,13 @@ define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf3,0x7d,0x08,0x03,0xd9,0x02] -; CHECK-NEXT: ## xmm3 = xmm1[2,3],xmm0[0,1] +; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf3,0x7d,0x08,0x03,0xd9,0x02]xmm3 = xmm1[2,3],xmm0[0,1] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02] -; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3],xmm0[0,1] -; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02] -; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1] -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02]xmm2 {%k1} = xmm1[2,3],xmm0[0,1] +; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02]xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1] +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb][1:0.50] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 
2, <4 x i32> %x3, i8 %x4) %res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1) %res2 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> zeroinitializer,i8 %x4) @@ -4660,13 +4545,11 @@ define <8 x i32>@test_int_x86_avx512_mask_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x03,0xd9,0x06] -; CHECK-NEXT: ## ymm3 = ymm1[6,7],ymm0[0,1,2,3,4,5] +; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x03,0xd9,0x06]ymm3 = ymm1[6,7],ymm0[0,1,2,3,4,5] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x03,0xd1,0x06] -; CHECK-NEXT: ## ymm2 {%k1} = ymm1[6,7],ymm0[0,1,2,3,4,5] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x03,0xd1,0x06]ymm2 {%k1} = ymm1[6,7],ymm0[0,1,2,3,4,5] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 6, <8 x i32> %x3, i8 %x4) %res1 = call <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 6, <8 x i32> %x3, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -4678,13 +4561,11 @@ define <2 x i64>@test_int_x86_avx512_mask_valign_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_valign_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: valignq $1, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf3,0xfd,0x08,0x03,0xd9,0x01] -; CHECK-NEXT: ## xmm3 = xmm1[1],xmm0[0] +; CHECK-NEXT: valignq $1, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf3,0xfd,0x08,0x03,0xd9,0x01]xmm3 = xmm1[1],xmm0[0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: valignq $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x03,0xd1,0x01] -; CHECK-NEXT: ## xmm2 {%k1} = xmm1[1],xmm0[0] -; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: valignq $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x03,0xd1,0x01]xmm2 {%k1} = xmm1[1],xmm0[0] +; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 1, <2 x i64> %x3, i8 %x4) %res1 = call <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 1, <2 x i64> %x3, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -4696,13 +4577,11 @@ define <4 x i64>@test_int_x86_avx512_mask_valign_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_valign_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03] -; CHECK-NEXT: ## ymm3 = ymm1[3],ymm0[0,1,2] +; CHECK-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03]ymm3 = ymm1[3],ymm0[0,1,2] ; CHECK-NEXT: 
kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: valignq $3, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x03,0xd1,0x03] -; CHECK-NEXT: ## ymm2 {%k1} = ymm1[3],ymm0[0,1,2] -; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: valignq $3, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x03,0xd1,0x03]ymm2 {%k1} = ymm1[3],ymm0[0,1,2] +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 3, <4 x i64> %x3, i8 %x4) %res1 = call <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 3, <4 x i64> %x3, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -4714,13 +4593,13 @@ define <4 x double>@test_int_x86_avx512_mask_vpermilvar_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0d,0xd9] +; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0d,0xd9][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x0d,0xd1] ; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x0d,0xc1] -; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3) %res2 = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) @@ -4734,13 +4613,13 @@ define <2 x double>@test_int_x86_avx512_mask_vpermilvar_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0d,0xd9] +; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0d,0xd9][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x0d,0xd1] ; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x0d,0xc1] -; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc0] -; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xe9,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> zeroinitializer, i8 %x3) %res2 = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1) @@ -4754,13 +4633,13 @@ define <8 x float>@test_int_x86_avx512_mask_vpermilvar_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0c,0xd9] +; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0c,0xd9][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x0c,0xd1] ; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x0c,0xc1] -; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3) %res2 = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) @@ -4774,13 +4653,13 @@ define <4 x float>@test_int_x86_avx512_mask_vpermilvar_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0xd9] +; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0xd9][1:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x0c,0xd1] ; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x0c,0xc1] -; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] -; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> 
@llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> zeroinitializer, i8 %x3) %res2 = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1) @@ -4794,13 +4673,13 @@ define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x4_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01][3:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01] ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc0,0x01] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3) %res2 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1) @@ -4814,13 +4693,13 @@ define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01][3:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01] ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01] -; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb] -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb][3:1.00] +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4) %res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1) %res2 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> zeroinitializer, i8 %x4) @@ -4834,13 +4713,13 @@ define <8 x 
i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01][3:1.00] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01] ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb] -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb][1:0.50] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4) %res1 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1) @@ -4855,7 +4734,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask) ret <8 x float> %res } @@ -4865,8 +4744,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5f,0xd1] -; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) ret <8 x float> %res } @@ -4874,8 +4753,8 @@ define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_max_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5f,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1) ret <8 x float> %res } @@ -4886,7 +4765,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask) 
ret <4 x float> %res } @@ -4896,8 +4775,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5f,0xd1] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) ret <4 x float> %res } @@ -4905,8 +4784,8 @@ define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_max_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1) ret <4 x float> %res } @@ -4917,7 +4796,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5d,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask) ret <8 x float> %res } @@ -4927,8 +4806,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5d,0xd1] -; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) ret <8 x float> %res } @@ -4936,8 +4815,8 @@ define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_min_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5d,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5d,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1) ret <8 x float> %res } @@ -4948,7 +4827,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5d,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask) ret <4 x float> %res } @@ -4958,8 +4837,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: 
[0x62,0xf1,0x7c,0x09,0x5d,0xd1] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) ret <4 x float> %res } @@ -4967,8 +4846,8 @@ define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_min_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1) ret <4 x float> %res } Index: test/CodeGen/X86/avx512vl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512vl-intrinsics.ll +++ test/CodeGen/X86/avx512vl-intrinsics.ll @@ -6,31 +6,31 @@ define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: test_cmp_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc1,0x00] -; CHECK-NEXT: vpcmpltd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x01] -; CHECK-NEXT: vpcmpled %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd1,0x02] -; CHECK-NEXT: vpcmpunordd %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd9,0x03] -; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xe1,0x04] -; CHECK-NEXT: vpcmpnltd %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xe9,0x05] -; CHECK-NEXT: vpcmpnled %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xf1,0x06] -; CHECK-NEXT: vpcmpordd %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xf9,0x07] +; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc1,0x00][?:0.000000e+00] +; CHECK-NEXT: vpcmpltd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x01][?:0.000000e+00] +; CHECK-NEXT: vpcmpled %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd1,0x02][?:0.000000e+00] +; CHECK-NEXT: vpcmpunordd %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd9,0x03][?:0.000000e+00] +; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xe1,0x04][?:0.000000e+00] +; CHECK-NEXT: vpcmpnltd %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xe9,0x05][?:0.000000e+00] +; CHECK-NEXT: vpcmpnled %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xf1,0x06][?:0.000000e+00] +; CHECK-NEXT: vpcmpordd %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xf9,0x07][?:0.000000e+00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1][1:1.00] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02][1:1.00] ; CHECK-NEXT: kmovw 
%k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04][1:1.00] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06][1:1.00] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08][1:1.00] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a][1:1.00] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] -; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c][1:1.00] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] -; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1) @@ -54,31 +54,31 @@ ; CHECK-LABEL: test_mask_cmp_d_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x00] -; CHECK-NEXT: vpcmpltd %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd1,0x01] -; CHECK-NEXT: vpcmpled %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd9,0x02] -; CHECK-NEXT: vpcmpunordd %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xe1,0x03] -; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xe9,0x04] -; CHECK-NEXT: vpcmpnltd %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xf1,0x05] -; CHECK-NEXT: vpcmpnled %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xf9,0x06] -; CHECK-NEXT: vpcmpordd %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc9,0x07] +; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x00][?:0.000000e+00] +; CHECK-NEXT: vpcmpltd %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd1,0x01][?:0.000000e+00] +; CHECK-NEXT: vpcmpled %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd9,0x02][?:0.000000e+00] +; CHECK-NEXT: vpcmpunordd %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xe1,0x03][?:0.000000e+00] +; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xe9,0x04][?:0.000000e+00] +; CHECK-NEXT: vpcmpnltd %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xf1,0x05][?:0.000000e+00] +; CHECK-NEXT: vpcmpnled %ymm1, %ymm0, %k7 {%k1} ## encoding: 
[0x62,0xf3,0x7d,0x29,0x1f,0xf9,0x06][?:0.000000e+00] +; CHECK-NEXT: vpcmpordd %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc9,0x07][?:0.000000e+00] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1][1:1.00] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02][1:1.00] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04][1:1.00] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06][1:1.00] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08][1:1.00] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] -; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a][1:1.00] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] -; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c][1:1.00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask) @@ -103,31 +103,31 @@ define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: test_ucmp_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpequd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x00] -; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc9,0x01] -; CHECK-NEXT: vpcmpleud %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xd1,0x02] -; CHECK-NEXT: vpcmpunordud %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xd9,0x03] -; CHECK-NEXT: vpcmpnequd %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe1,0x04] -; CHECK-NEXT: vpcmpnltud %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe9,0x05] -; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xf1,0x06] -; CHECK-NEXT: vpcmpordud %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xf9,0x07] +; CHECK-NEXT: vpcmpequd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x00][?:0.000000e+00] +; 
CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc9,0x01][?:0.000000e+00] +; CHECK-NEXT: vpcmpleud %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xd1,0x02][?:0.000000e+00] +; CHECK-NEXT: vpcmpunordud %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xd9,0x03][?:0.000000e+00] +; CHECK-NEXT: vpcmpnequd %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe1,0x04][?:0.000000e+00] +; CHECK-NEXT: vpcmpnltud %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe9,0x05][?:0.000000e+00] +; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xf1,0x06][?:0.000000e+00] +; CHECK-NEXT: vpcmpordud %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xf9,0x07][?:0.000000e+00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1][1:1.00] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02][1:1.00] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04][1:1.00] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06][1:1.00] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08][1:1.00] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a][1:1.00] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] -; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c][1:1.00] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] -; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1) @@ -151,31 +151,31 @@ ; CHECK-LABEL: test_mask_ucmp_d_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x00] -; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xd1,0x01] -; CHECK-NEXT: vpcmpleud %ymm1, %ymm0, %k3 {%k1} ## encoding: 
[0x62,0xf3,0x7d,0x29,0x1e,0xd9,0x02] -; CHECK-NEXT: vpcmpunordud %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xe1,0x03] -; CHECK-NEXT: vpcmpnequd %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xe9,0x04] -; CHECK-NEXT: vpcmpnltud %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xf1,0x05] -; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xf9,0x06] -; CHECK-NEXT: vpcmpordud %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc9,0x07] +; CHECK-NEXT: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x00][?:0.000000e+00] +; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xd1,0x01][?:0.000000e+00] +; CHECK-NEXT: vpcmpleud %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xd9,0x02][?:0.000000e+00] +; CHECK-NEXT: vpcmpunordud %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xe1,0x03][?:0.000000e+00] +; CHECK-NEXT: vpcmpnequd %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xe9,0x04][?:0.000000e+00] +; CHECK-NEXT: vpcmpnltud %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xf1,0x05][?:0.000000e+00] +; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xf9,0x06][?:0.000000e+00] +; CHECK-NEXT: vpcmpordud %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc9,0x07][?:0.000000e+00] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1][1:1.00] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02][1:1.00] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04][1:1.00] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06][1:1.00] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08][1:1.00] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] -; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a][1:1.00] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] -; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c][1:1.00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e][1:1.00] 
+; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask) @@ -200,31 +200,31 @@ define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_cmp_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc1,0x00] -; CHECK-NEXT: vpcmpltq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x01] -; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd1,0x02] -; CHECK-NEXT: vpcmpunordq %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd9,0x03] -; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xe1,0x04] -; CHECK-NEXT: vpcmpnltq %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xe9,0x05] -; CHECK-NEXT: vpcmpnleq %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xf1,0x06] -; CHECK-NEXT: vpcmpordq %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xf9,0x07] +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc1,0x00][?:0.000000e+00] +; CHECK-NEXT: vpcmpltq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x01][?:0.000000e+00] +; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd1,0x02][?:0.000000e+00] +; CHECK-NEXT: vpcmpunordq %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd9,0x03][?:0.000000e+00] +; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xe1,0x04][?:0.000000e+00] +; CHECK-NEXT: vpcmpnltq %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xe9,0x05][?:0.000000e+00] +; CHECK-NEXT: vpcmpnleq %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xf1,0x06][?:0.000000e+00] +; CHECK-NEXT: vpcmpordq %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xf9,0x07][?:0.000000e+00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1][1:1.00] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02][1:1.00] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04][1:1.00] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06][1:1.00] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08][1:1.00] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: 
[0xc4,0xe3,0x79,0x20,0xc0,0x0a][1:1.00] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] -; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c][1:1.00] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] -; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1) @@ -248,31 +248,31 @@ ; CHECK-LABEL: test_mask_cmp_q_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xc1,0x00] -; CHECK-NEXT: vpcmpltq %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xd1,0x01] -; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xd9,0x02] -; CHECK-NEXT: vpcmpunordq %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe1,0x03] -; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe9,0x04] -; CHECK-NEXT: vpcmpnltq %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xf1,0x05] -; CHECK-NEXT: vpcmpnleq %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xf9,0x06] -; CHECK-NEXT: vpcmpordq %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xc9,0x07] +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xc1,0x00][?:0.000000e+00] +; CHECK-NEXT: vpcmpltq %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xd1,0x01][?:0.000000e+00] +; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xd9,0x02][?:0.000000e+00] +; CHECK-NEXT: vpcmpunordq %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe1,0x03][?:0.000000e+00] +; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe9,0x04][?:0.000000e+00] +; CHECK-NEXT: vpcmpnltq %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xf1,0x05][?:0.000000e+00] +; CHECK-NEXT: vpcmpnleq %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xf9,0x06][?:0.000000e+00] +; CHECK-NEXT: vpcmpordq %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xc9,0x07][?:0.000000e+00] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1][1:1.00] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02][1:1.00] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04][1:1.00] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: 
[0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06][1:1.00] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08][1:1.00] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] -; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a][1:1.00] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] -; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c][1:1.00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask) @@ -297,31 +297,31 @@ define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_ucmp_q_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpequq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc1,0x00] -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc9,0x01] -; CHECK-NEXT: vpcmpleuq %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xd1,0x02] -; CHECK-NEXT: vpcmpunorduq %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xd9,0x03] -; CHECK-NEXT: vpcmpnequq %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe1,0x04] -; CHECK-NEXT: vpcmpnltuq %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe9,0x05] -; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xf1,0x06] -; CHECK-NEXT: vpcmporduq %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xf9,0x07] +; CHECK-NEXT: vpcmpequq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc1,0x00][?:0.000000e+00] +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc9,0x01][?:0.000000e+00] +; CHECK-NEXT: vpcmpleuq %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xd1,0x02][?:0.000000e+00] +; CHECK-NEXT: vpcmpunorduq %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xd9,0x03][?:0.000000e+00] +; CHECK-NEXT: vpcmpnequq %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe1,0x04][?:0.000000e+00] +; CHECK-NEXT: vpcmpnltuq %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe9,0x05][?:0.000000e+00] +; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xf1,0x06][?:0.000000e+00] +; CHECK-NEXT: vpcmporduq %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xf9,0x07][?:0.000000e+00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1][1:1.00] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02][1:1.00] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04][1:1.00] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06][1:1.00] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08][1:1.00] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a][1:1.00] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] -; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c][1:1.00] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] -; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1) @@ -345,31 +345,31 @@ ; CHECK-LABEL: test_mask_ucmp_q_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xc1,0x00] -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd1,0x01] -; CHECK-NEXT: vpcmpleuq %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd9,0x02] -; CHECK-NEXT: vpcmpunorduq %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xe1,0x03] -; CHECK-NEXT: vpcmpnequq %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xe9,0x04] -; CHECK-NEXT: vpcmpnltuq %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xf1,0x05] -; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xf9,0x06] -; CHECK-NEXT: vpcmporduq %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xc9,0x07] +; CHECK-NEXT: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xc1,0x00][?:0.000000e+00] +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd1,0x01][?:0.000000e+00] +; CHECK-NEXT: vpcmpleuq %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd9,0x02][?:0.000000e+00] +; CHECK-NEXT: vpcmpunorduq %ymm1, %ymm0, %k4 {%k1} ## encoding: 
[0x62,0xf3,0xfd,0x29,0x1e,0xe1,0x03][?:0.000000e+00] +; CHECK-NEXT: vpcmpnequq %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xe9,0x04][?:0.000000e+00] +; CHECK-NEXT: vpcmpnltuq %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xf1,0x05][?:0.000000e+00] +; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xf9,0x06][?:0.000000e+00] +; CHECK-NEXT: vpcmporduq %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xc9,0x07][?:0.000000e+00] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1][1:1.00] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02][1:1.00] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04][1:1.00] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06][1:1.00] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08][1:1.00] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] -; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a][1:1.00] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] -; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c][1:1.00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask) @@ -396,31 +396,31 @@ define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: test_cmp_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc1,0x00] -; CHECK-NEXT: vpcmpltd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x01] -; CHECK-NEXT: vpcmpled %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd1,0x02] -; CHECK-NEXT: vpcmpunordd %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd9,0x03] -; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xe1,0x04] -; CHECK-NEXT: vpcmpnltd %xmm1, %xmm0, %k5 ## encoding: 
[0x62,0xf3,0x7d,0x08,0x1f,0xe9,0x05] -; CHECK-NEXT: vpcmpnled %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xf1,0x06] -; CHECK-NEXT: vpcmpordd %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xf9,0x07] +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc1,0x00][?:0.000000e+00] +; CHECK-NEXT: vpcmpltd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x01][?:0.000000e+00] +; CHECK-NEXT: vpcmpled %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd1,0x02][?:0.000000e+00] +; CHECK-NEXT: vpcmpunordd %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd9,0x03][?:0.000000e+00] +; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xe1,0x04][?:0.000000e+00] +; CHECK-NEXT: vpcmpnltd %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xe9,0x05][?:0.000000e+00] +; CHECK-NEXT: vpcmpnled %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xf1,0x06][?:0.000000e+00] +; CHECK-NEXT: vpcmpordd %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xf9,0x07][?:0.000000e+00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1][1:1.00] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02][1:1.00] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04][1:1.00] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06][1:1.00] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08][1:1.00] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a][1:1.00] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] -; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c][1:1.00] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] -; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1) @@ -444,31 +444,31 @@ ; CHECK-LABEL: test_mask_cmp_d_128: ; CHECK: ## BB#0: ; 
CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xc1,0x00] -; CHECK-NEXT: vpcmpltd %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xd1,0x01] -; CHECK-NEXT: vpcmpled %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xd9,0x02] -; CHECK-NEXT: vpcmpunordd %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe1,0x03] -; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe9,0x04] -; CHECK-NEXT: vpcmpnltd %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xf1,0x05] -; CHECK-NEXT: vpcmpnled %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xf9,0x06] -; CHECK-NEXT: vpcmpordd %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xc9,0x07] +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xc1,0x00][?:0.000000e+00] +; CHECK-NEXT: vpcmpltd %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xd1,0x01][?:0.000000e+00] +; CHECK-NEXT: vpcmpled %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xd9,0x02][?:0.000000e+00] +; CHECK-NEXT: vpcmpunordd %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe1,0x03][?:0.000000e+00] +; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe9,0x04][?:0.000000e+00] +; CHECK-NEXT: vpcmpnltd %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xf1,0x05][?:0.000000e+00] +; CHECK-NEXT: vpcmpnled %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xf9,0x06][?:0.000000e+00] +; CHECK-NEXT: vpcmpordd %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xc9,0x07][?:0.000000e+00] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1][1:1.00] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02][1:1.00] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04][1:1.00] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06][1:1.00] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08][1:1.00] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] -; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a][1:1.00] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] -; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: 
[0xc4,0xe3,0x79,0x20,0xc0,0x0c][1:1.00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask) @@ -493,31 +493,31 @@ define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: test_ucmp_d_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpequd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc1,0x00] -; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc9,0x01] -; CHECK-NEXT: vpcmpleud %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xd1,0x02] -; CHECK-NEXT: vpcmpunordud %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xd9,0x03] -; CHECK-NEXT: vpcmpnequd %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe1,0x04] -; CHECK-NEXT: vpcmpnltud %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe9,0x05] -; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xf1,0x06] -; CHECK-NEXT: vpcmpordud %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xf9,0x07] +; CHECK-NEXT: vpcmpequd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc1,0x00][?:0.000000e+00] +; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc9,0x01][?:0.000000e+00] +; CHECK-NEXT: vpcmpleud %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xd1,0x02][?:0.000000e+00] +; CHECK-NEXT: vpcmpunordud %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xd9,0x03][?:0.000000e+00] +; CHECK-NEXT: vpcmpnequd %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe1,0x04][?:0.000000e+00] +; CHECK-NEXT: vpcmpnltud %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe9,0x05][?:0.000000e+00] +; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xf1,0x06][?:0.000000e+00] +; CHECK-NEXT: vpcmpordud %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xf9,0x07][?:0.000000e+00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1][1:1.00] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02][1:1.00] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04][1:1.00] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06][1:1.00] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: 
[0xc4,0xe3,0x79,0x20,0xc0,0x08] +; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08][1:1.00] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a][1:1.00] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] -; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c][1:1.00] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] -; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1) @@ -541,31 +541,31 @@ ; CHECK-LABEL: test_mask_ucmp_d_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xc1,0x00] -; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd1,0x01] -; CHECK-NEXT: vpcmpleud %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd9,0x02] -; CHECK-NEXT: vpcmpunordud %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xe1,0x03] -; CHECK-NEXT: vpcmpnequd %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xe9,0x04] -; CHECK-NEXT: vpcmpnltud %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xf1,0x05] -; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xf9,0x06] -; CHECK-NEXT: vpcmpordud %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xc9,0x07] +; CHECK-NEXT: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xc1,0x00][?:0.000000e+00] +; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd1,0x01][?:0.000000e+00] +; CHECK-NEXT: vpcmpleud %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd9,0x02][?:0.000000e+00] +; CHECK-NEXT: vpcmpunordud %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xe1,0x03][?:0.000000e+00] +; CHECK-NEXT: vpcmpnequd %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xe9,0x04][?:0.000000e+00] +; CHECK-NEXT: vpcmpnltud %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xf1,0x05][?:0.000000e+00] +; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xf9,0x06][?:0.000000e+00] +; CHECK-NEXT: vpcmpordud %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xc9,0x07][?:0.000000e+00] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1][1:1.00] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, 
%xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02][1:1.00] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04][1:1.00] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06][1:1.00] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08][1:1.00] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] -; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a][1:1.00] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] -; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c][1:1.00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask) @@ -590,31 +590,31 @@ define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: test_cmp_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc1,0x00] -; CHECK-NEXT: vpcmpltq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x01] -; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd1,0x02] -; CHECK-NEXT: vpcmpunordq %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd9,0x03] -; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xe1,0x04] -; CHECK-NEXT: vpcmpnltq %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xe9,0x05] -; CHECK-NEXT: vpcmpnleq %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xf1,0x06] -; CHECK-NEXT: vpcmpordq %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xf9,0x07] +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc1,0x00][?:0.000000e+00] +; CHECK-NEXT: vpcmpltq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x01][?:0.000000e+00] +; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd1,0x02][?:0.000000e+00] +; CHECK-NEXT: vpcmpunordq %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd9,0x03][?:0.000000e+00] +; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xe1,0x04][?:0.000000e+00] +; CHECK-NEXT: vpcmpnltq %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xe9,0x05][?:0.000000e+00] +; CHECK-NEXT: vpcmpnleq %xmm1, %xmm0, %k6 ## encoding: 
[0x62,0xf3,0xfd,0x08,0x1f,0xf1,0x06][?:0.000000e+00] +; CHECK-NEXT: vpcmpordq %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xf9,0x07][?:0.000000e+00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1][1:1.00] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02][1:1.00] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04][1:1.00] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06][1:1.00] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08][1:1.00] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a][1:1.00] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] -; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c][1:1.00] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] -; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1) @@ -638,31 +638,31 @@ ; CHECK-LABEL: test_mask_cmp_q_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xc1,0x00] -; CHECK-NEXT: vpcmpltq %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xd1,0x01] -; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xd9,0x02] -; CHECK-NEXT: vpcmpunordq %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe1,0x03] -; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe9,0x04] -; CHECK-NEXT: vpcmpnltq %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xf1,0x05] -; CHECK-NEXT: vpcmpnleq %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xf9,0x06] -; CHECK-NEXT: vpcmpordq %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xc9,0x07] +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## encoding: 
[0x62,0xf3,0xfd,0x09,0x1f,0xc1,0x00][?:0.000000e+00] +; CHECK-NEXT: vpcmpltq %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xd1,0x01][?:0.000000e+00] +; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xd9,0x02][?:0.000000e+00] +; CHECK-NEXT: vpcmpunordq %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe1,0x03][?:0.000000e+00] +; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe9,0x04][?:0.000000e+00] +; CHECK-NEXT: vpcmpnltq %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xf1,0x05][?:0.000000e+00] +; CHECK-NEXT: vpcmpnleq %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xf9,0x06][?:0.000000e+00] +; CHECK-NEXT: vpcmpordq %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xc9,0x07][?:0.000000e+00] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1][1:1.00] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02][1:1.00] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04][1:1.00] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06][1:1.00] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08][1:1.00] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] -; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a][1:1.00] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] -; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c][1:1.00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask) @@ -687,31 +687,31 @@ define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: test_ucmp_q_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpequq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc1,0x00] -; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 ## encoding: 
[0x62,0xf3,0xfd,0x08,0x1e,0xc9,0x01] -; CHECK-NEXT: vpcmpleuq %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xd1,0x02] -; CHECK-NEXT: vpcmpunorduq %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xd9,0x03] -; CHECK-NEXT: vpcmpnequq %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe1,0x04] -; CHECK-NEXT: vpcmpnltuq %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe9,0x05] -; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xf1,0x06] -; CHECK-NEXT: vpcmporduq %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xf9,0x07] +; CHECK-NEXT: vpcmpequq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc1,0x00][?:0.000000e+00] +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc9,0x01][?:0.000000e+00] +; CHECK-NEXT: vpcmpleuq %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xd1,0x02][?:0.000000e+00] +; CHECK-NEXT: vpcmpunorduq %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xd9,0x03][?:0.000000e+00] +; CHECK-NEXT: vpcmpnequq %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe1,0x04][?:0.000000e+00] +; CHECK-NEXT: vpcmpnltuq %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe9,0x05][?:0.000000e+00] +; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xf1,0x06][?:0.000000e+00] +; CHECK-NEXT: vpcmporduq %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xf9,0x07][?:0.000000e+00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1][1:1.00] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02][1:1.00] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04][1:1.00] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06][1:1.00] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08][1:1.00] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a][1:1.00] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] -; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c][1:1.00] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] -; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: 
[0xc4,0xe3,0x79,0x20,0xc0,0x0e][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1) @@ -735,31 +735,31 @@ ; CHECK-LABEL: test_mask_ucmp_q_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xc1,0x00] -; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd1,0x01] -; CHECK-NEXT: vpcmpleuq %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd9,0x02] -; CHECK-NEXT: vpcmpunorduq %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xe1,0x03] -; CHECK-NEXT: vpcmpnequq %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xe9,0x04] -; CHECK-NEXT: vpcmpnltuq %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xf1,0x05] -; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xf9,0x06] -; CHECK-NEXT: vpcmporduq %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xc9,0x07] +; CHECK-NEXT: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xc1,0x00][?:0.000000e+00] +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd1,0x01][?:0.000000e+00] +; CHECK-NEXT: vpcmpleuq %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd9,0x02][?:0.000000e+00] +; CHECK-NEXT: vpcmpunorduq %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xe1,0x03][?:0.000000e+00] +; CHECK-NEXT: vpcmpnequq %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xe9,0x04][?:0.000000e+00] +; CHECK-NEXT: vpcmpnltuq %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xf1,0x05][?:0.000000e+00] +; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xf9,0x06][?:0.000000e+00] +; CHECK-NEXT: vpcmporduq %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xc9,0x07][?:0.000000e+00] ; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1][1:1.00] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02][1:1.00] ; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04][1:1.00] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06][1:1.00] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08] +; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08][1:1.00] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] -; CHECK-NEXT: 
vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a] +; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a][1:1.00] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] -; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c] +; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c][1:1.00] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask) %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask) @@ -786,7 +786,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vcompresspd %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8a,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask) ret void } @@ -798,7 +798,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vcompresspd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask) ret void } @@ -810,7 +810,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vcompressps %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x8a,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask) ret void } @@ -822,7 +822,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vcompresspd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x8a,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask) ret <8 x double> %res } @@ -834,8 +834,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcompresspd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0xc1] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.compress.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask) ret <4 x double> %res } @@ -847,7 +847,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcompressps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8a,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> 
@llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask) ret <4 x float> %res } @@ -858,7 +858,7 @@ ; CHECK-LABEL: compr7: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 -1) ret void } @@ -866,7 +866,7 @@ define <4 x float> @compr8(<4 x float> %data) { ; CHECK-LABEL: compr8: ; CHECK: ## BB#0: -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1) ret <4 x float> %res } @@ -876,7 +876,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpcompressq %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8b,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask) ret void } @@ -888,7 +888,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask) ret <4 x i32> %res } @@ -902,17 +902,19 @@ ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: movq _xmm@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A] ; CHECK-NEXT: ## fixup A - offset: 3, value: _xmm@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load -; CHECK-NEXT: vmovdqa (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x00] +; CHECK-NEXT: ## [4:0.50] +; CHECK-NEXT: vmovdqa (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x00][4:0.50] ; CHECK-NEXT: movq _k8@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A] ; CHECK-NEXT: ## fixup A - offset: 3, value: _k8@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load -; CHECK-NEXT: movzbl (%rax), %eax ## encoding: [0x0f,0xb6,0x00] +; CHECK-NEXT: ## [4:0.50] +; CHECK-NEXT: movzbl (%rax), %eax ## encoding: [0x0f,0xb6,0x00][4:0.50] ; CHECK-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; CHECK-NEXT: vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] -; CHECK-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x44,0x24,0xd8] -; CHECK-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x4c,0x24,0xe8] -; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9][1:0.33] +; CHECK-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x44,0x24,0xd8][1:1.00] +; CHECK-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x4c,0x24,0xe8][1:1.00] +; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] entry: %.compoundliteral = alloca <2 x i64>, align 16 %res = alloca <4 x i32>, align 16 @@ -933,7 +935,7 @@ ; 
CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vexpandpd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x88,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask) ret <8 x double> %res } @@ -945,7 +947,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vexpandpd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask) ret <4 x double> %res } @@ -957,7 +959,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vexpandps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x88,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask) ret <4 x float> %res } @@ -969,7 +971,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x88,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask) ret <8 x double> %res } @@ -981,8 +983,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vexpandpd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0xc8] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.expand.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask) ret <4 x double> %res } @@ -994,7 +996,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vexpandps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x88,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask) ret <4 x float> %res } @@ -1004,8 +1006,8 @@ define <8 x double> @expand7(i8* %addr, <8 x double> %data) { ; CHECK-LABEL: expand7: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 -1) ret <8 x double> %res } @@ -1013,7 +1015,7 @@ define <4 x float> @expand8(<4 x float> %data) { ; CHECK-LABEL: expand8: ; CHECK: ## BB#0: -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1) ret <4 x float> %res } @@ -1023,7 +1025,7 @@ ; CHECK: 
## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpexpandq (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x89,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask) ret <8 x i64> %res } @@ -1035,7 +1037,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpexpandd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x89,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask) ret <4 x i32> %res } @@ -1045,8 +1047,8 @@ define <8 x i64> @expand11(i8* %addr) { ; CHECK-LABEL: expand11: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07][4:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> undef, i8 -1) ret <8 x i64> %res } @@ -1056,7 +1058,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpexpandq (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x89,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %laddr = bitcast i8* %addr to <8 x i64>* %data = load <8 x i64>, <8 x i64>* %laddr, align 1 %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64>zeroinitializer, i8 %mask) @@ -1068,8 +1070,8 @@ define < 2 x i64> @test_mask_mul_epi32_rr_128(< 4 x i32> %a, < 4 x i32> %b) { ; CHECK-LABEL: test_mask_mul_epi32_rr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x28,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x28,0xc1][5:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1) ret < 2 x i64> %res } @@ -1079,8 +1081,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmuldq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x28,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask) ret < 2 x i64> %res } @@ -1090,7 +1092,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x28,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 %mask) ret < 2 x i64> %res } @@ -1098,8 +1100,8 @@ define < 2 x i64> @test_mask_mul_epi32_rm_128(< 4 x i32> %a, < 4 x i32>* %ptr_b) { ; 
CHECK-LABEL: test_mask_mul_epi32_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x28,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x28,0x07][9:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load < 4 x i32>, < 4 x i32>* %ptr_b %res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1) ret < 2 x i64> %res @@ -1110,8 +1112,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmuldq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x28,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load < 4 x i32>, < 4 x i32>* %ptr_b %res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask) ret < 2 x i64> %res @@ -1122,7 +1124,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x28,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load < 4 x i32>, < 4 x i32>* %ptr_b %res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 %mask) ret < 2 x i64> %res @@ -1132,7 +1134,7 @@ ; CHECK-LABEL: test_mask_mul_epi32_rmb_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x18,0x28,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0 %b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, <2 x i32> zeroinitializer @@ -1146,8 +1148,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0x28,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0 %b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, <2 x i32> zeroinitializer @@ -1161,7 +1163,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x99,0x28,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0 %b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, < 2 x i32> zeroinitializer @@ -1175,8 +1177,8 @@ define < 4 x i64> @test_mask_mul_epi32_rr_256(< 8 x i32> %a, < 8 x i32> %b) { ; CHECK-LABEL: test_mask_mul_epi32_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x28,0xc1] -; 
CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x28,0xc1][5:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1) ret < 4 x i64> %res } @@ -1186,8 +1188,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x28,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask) ret < 4 x i64> %res } @@ -1197,7 +1199,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x28,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 %mask) ret < 4 x i64> %res } @@ -1205,8 +1207,8 @@ define < 4 x i64> @test_mask_mul_epi32_rm_256(< 8 x i32> %a, < 8 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_mul_epi32_rm_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x28,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x28,0x07][9:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load < 8 x i32>, < 8 x i32>* %ptr_b %res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1) ret < 4 x i64> %res @@ -1217,8 +1219,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmuldq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x28,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load < 8 x i32>, < 8 x i32>* %ptr_b %res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask) ret < 4 x i64> %res @@ -1229,7 +1231,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x28,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load < 8 x i32>, < 8 x i32>* %ptr_b %res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 %mask) ret < 4 x i64> %res @@ -1239,7 +1241,7 @@ ; CHECK-LABEL: test_mask_mul_epi32_rmb_256: ; CHECK: ## BB#0: ; CHECK-NEXT: vpmuldq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x38,0x28,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement < 4 x i64> undef, i64 %q, 
i32 0 %b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer @@ -1253,8 +1255,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmuldq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x39,0x28,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0 %b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer @@ -1268,7 +1270,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmuldq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xb9,0x28,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0 %b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer @@ -1282,8 +1284,8 @@ define < 2 x i64> @test_mask_mul_epu32_rr_128(< 4 x i32> %a, < 4 x i32> %b) { ; CHECK-LABEL: test_mask_mul_epu32_rr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf4,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf4,0xc1][5:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1) ret < 2 x i64> %res } @@ -1293,8 +1295,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xf4,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask) ret < 2 x i64> %res } @@ -1304,7 +1306,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xf4,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 %mask) ret < 2 x i64> %res } @@ -1312,8 +1314,8 @@ define < 2 x i64> @test_mask_mul_epu32_rm_128(< 4 x i32> %a, < 4 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_mul_epu32_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf4,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf4,0x07][9:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load < 4 x i32>, < 4 x i32>* %ptr_b %res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1) ret < 2 x 
i64> %res @@ -1324,8 +1326,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmuludq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xf4,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load < 4 x i32>, < 4 x i32>* %ptr_b %res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask) ret < 2 x i64> %res @@ -1336,7 +1338,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xf4,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load < 4 x i32>, < 4 x i32>* %ptr_b %res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 %mask) ret < 2 x i64> %res @@ -1346,7 +1348,7 @@ ; CHECK-LABEL: test_mask_mul_epu32_rmb_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpmuludq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x18,0xf4,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0 %b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, <2 x i32> zeroinitializer @@ -1360,8 +1362,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmuludq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x19,0xf4,0x0f] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0 %b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, <2 x i32> zeroinitializer @@ -1375,7 +1377,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmuludq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0xf4,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0 %b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, < 2 x i32> zeroinitializer @@ -1389,8 +1391,8 @@ define < 4 x i64> @test_mask_mul_epu32_rr_256(< 8 x i32> %a, < 8 x i32> %b) { ; CHECK-LABEL: test_mask_mul_epu32_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf4,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf4,0xc1][5:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1) ret < 4 x i64> %res } @@ -1400,8 +1402,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf4,0xd1] -; 
CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask) ret < 4 x i64> %res } @@ -1411,7 +1413,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 %mask) ret < 4 x i64> %res } @@ -1419,8 +1421,8 @@ define < 4 x i64> @test_mask_mul_epu32_rm_256(< 8 x i32> %a, < 8 x i32>* %ptr_b) { ; CHECK-LABEL: test_mask_mul_epu32_rm_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf4,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf4,0x07][9:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load < 8 x i32>, < 8 x i32>* %ptr_b %res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1) ret < 4 x i64> %res @@ -1431,8 +1433,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmuludq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf4,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load < 8 x i32>, < 8 x i32>* %ptr_b %res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask) ret < 4 x i64> %res @@ -1443,7 +1445,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %b = load < 8 x i32>, < 8 x i32>* %ptr_b %res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 %mask) ret < 4 x i64> %res @@ -1453,7 +1455,7 @@ ; CHECK-LABEL: test_mask_mul_epu32_rmb_256: ; CHECK: ## BB#0: ; CHECK-NEXT: vpmuludq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x38,0xf4,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0 %b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer @@ -1467,8 +1469,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmuludq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x39,0xf4,0x0f] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; 
CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0 %b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer @@ -1482,7 +1484,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmuludq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0xf4,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load i64, i64* %ptr_b %vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0 %b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer @@ -1499,7 +1501,7 @@ ; CHECK-NEXT: vcmpleps %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x02] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2, i8 -1) ret i8 %res } @@ -1511,7 +1513,7 @@ ; CHECK-NEXT: vcmpleps %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2, i8 -1) ret i8 %res } @@ -1523,7 +1525,7 @@ ; CHECK-NEXT: vcmplepd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x28,0xc2,0xc1,0x02] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2, i8 -1) ret i8 %res } @@ -1535,7 +1537,7 @@ ; CHECK-NEXT: vcmplepd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x08,0xc2,0xc1,0x02] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2, i8 -1) ret i8 %res } @@ -1546,7 +1548,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = bitcast i8 %mask to <8 x i1> %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer @@ -1558,8 +1560,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5f,0xd1] -; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = bitcast i8 %mask to <8 x i1> %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %src @@ -1569,8 +1571,8 @@ define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x 
float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_max_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5f,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) ret <8 x float> %1 } @@ -1581,7 +1583,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) %2 = bitcast i8 %mask to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> @@ -1594,8 +1596,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5f,0xd1] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) %2 = bitcast i8 %mask to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> @@ -1606,8 +1608,8 @@ define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_max_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ret <4 x float> %1 } @@ -1618,7 +1620,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5d,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = bitcast i8 %mask to <8 x i1> %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer @@ -1630,8 +1632,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5d,0xd1] -; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = bitcast i8 %mask to <8 x i1> %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %src @@ -1641,8 +1643,8 @@ define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_min_ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xfc,0x5d,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5d,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) ret <8 x float> %1 } @@ -1653,7 +1655,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5d,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) %2 = bitcast i8 %mask to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> @@ -1666,8 +1668,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5d,0xd1] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) %2 = bitcast i8 %mask to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> @@ -1678,8 +1680,8 @@ define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_mm512_min_ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ret <4 x float> %1 } @@ -1690,7 +1692,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x51,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask) ret <4 x double> %res } @@ -1701,7 +1703,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x51,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask) ret <8 x float> %res } @@ -1712,7 +1714,7 @@ ; CHECK-LABEL: test_getexp_pd_256: ; CHECK: ## BB#0: ; CHECK-NEXT: vgetexppd %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x42,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1) ret <4 x double> %res } @@ -1723,7 +1725,7 @@ ; CHECK-LABEL: test_getexp_ps_256: ; CHECK: ## BB#0: ; CHECK-NEXT: vgetexpps %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x42,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float> 
%a0, <8 x float> zeroinitializer, i8 -1) ret <8 x float> %res } @@ -1735,11 +1737,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] +; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9][1:0.33] ; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7e,0xca] -; CHECK-NEXT: vpaddd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -1752,11 +1754,11 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] +; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9][1:0.33] ; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7e,0xca] -; CHECK-NEXT: vpaddd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -1769,11 +1771,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] +; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9][1:0.33] ; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7e,0xca] -; CHECK-NEXT: vpaddd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -1786,11 +1788,11 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; 
CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] +; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9][1:0.33] ; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7e,0xca] -; CHECK-NEXT: vpaddd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -1803,11 +1805,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9] +; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9][1:1.00] ; CHECK-NEXT: vpermi2pd %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x77,0xda] ; CHECK-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x77,0xca] -; CHECK-NEXT: vaddpd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -1820,11 +1822,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9] +; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9][1:1.00] ; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x77,0xda] ; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x77,0xca] -; CHECK-NEXT: vaddpd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -1837,11 +1839,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9] +; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf8,0x28,0xd9][1:1.00] ; CHECK-NEXT: vpermi2ps %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x77,0xda] ; CHECK-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x77,0xca] -; CHECK-NEXT: vaddps %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -1853,8 +1855,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x77,0xca] -; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %x1cast = bitcast <2 x i64> %x1 to <4 x i32> %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1cast, <4 x float> %x2, i8 %x3) ret <4 x float> %res @@ -1866,11 +1868,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9] +; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9][1:1.00] ; CHECK-NEXT: vpermi2ps %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x77,0xda] ; CHECK-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x77,0xca] -; CHECK-NEXT: vaddps %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -1885,8 +1887,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpabsq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x1f,0xc8] ; CHECK-NEXT: vpabsq %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x1f,0xc0] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -1901,8 +1903,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpabsq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x1f,0xc8] ; CHECK-NEXT: vpabsq 
%ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x1f,0xc0] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -1916,9 +1918,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpabsd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1e,0xc8] -; CHECK-NEXT: vpabsd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1e,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpabsd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1e,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -1932,9 +1934,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpabsd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1e,0xc8] -; CHECK-NEXT: vpabsd %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1e,0xc0] -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpabsd %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1e,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -1949,8 +1951,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vscalefpd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x2c,0xd1] ; CHECK-NEXT: vscalefpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x2c,0xc1] -; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -1965,8 +1967,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vscalefpd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x2c,0xd1] ; CHECK-NEXT: vscalefpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x2c,0xc1] -; CHECK-NEXT: vaddpd 
%ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -1981,8 +1983,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vscalefps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2c,0xd1] ; CHECK-NEXT: vscalefps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x2c,0xc1] -; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -1997,8 +1999,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vscalefps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2c,0xd1] ; CHECK-NEXT: vscalefps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x2c,0xc1] -; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -2014,9 +2016,9 @@ ; CHECK-NEXT: vpmovqb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc2] ; CHECK-NEXT: vpmovqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1] ; CHECK-NEXT: vpmovqb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc0] -; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1] -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1][1:0.50] +; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) @@ -2033,7 +2035,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovqb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x32,0x07] ; CHECK-NEXT: vpmovqb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x32,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## 
encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) ret void @@ -2048,9 +2050,9 @@ ; CHECK-NEXT: vpmovsqb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc2] ; CHECK-NEXT: vpmovsqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1] ; CHECK-NEXT: vpmovsqb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc0] -; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1] -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1][1:0.50] +; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) @@ -2067,7 +2069,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovsqb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x22,0x07] ; CHECK-NEXT: vpmovsqb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x22,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) ret void @@ -2082,9 +2084,9 @@ ; CHECK-NEXT: vpmovusqb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc2] ; CHECK-NEXT: vpmovusqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1] ; CHECK-NEXT: vpmovusqb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc0] -; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1] -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1][1:0.50] +; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) @@ -2101,7 +2103,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovusqb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x12,0x07] ; CHECK-NEXT: vpmovusqb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x12,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) ret void @@ -2116,9 +2118,9 @@ ; CHECK-NEXT: vpmovqb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc2] ; CHECK-NEXT: vpmovqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1] ; CHECK-NEXT: vpmovqb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc0] -; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## 
encoding: [0xc5,0xf9,0xfc,0xc1] -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1][1:0.50] +; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) @@ -2135,7 +2137,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovqb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x32,0x07] ; CHECK-NEXT: vpmovqb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x32,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) ret void @@ -2150,9 +2152,9 @@ ; CHECK-NEXT: vpmovsqb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc2] ; CHECK-NEXT: vpmovsqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1] ; CHECK-NEXT: vpmovsqb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc0] -; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1] -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1][1:0.50] +; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) @@ -2169,7 +2171,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovsqb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x22,0x07] ; CHECK-NEXT: vpmovsqb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x22,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) ret void @@ -2184,9 +2186,9 @@ ; CHECK-NEXT: vpmovusqb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc2] ; CHECK-NEXT: vpmovusqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1] ; CHECK-NEXT: vpmovusqb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc0] -; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1] -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1][1:0.50] +; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 
x i8> %x1, i8 %x2) %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) @@ -2203,7 +2205,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovusqb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x12,0x07] ; CHECK-NEXT: vpmovusqb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x12,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) ret void @@ -2218,9 +2220,9 @@ ; CHECK-NEXT: vpmovqw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x34,0xc2] ; CHECK-NEXT: vpmovqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1] ; CHECK-NEXT: vpmovqw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc0] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) @@ -2237,7 +2239,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovqw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x34,0x07] ; CHECK-NEXT: vpmovqw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x34,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) ret void @@ -2252,9 +2254,9 @@ ; CHECK-NEXT: vpmovsqw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x24,0xc2] ; CHECK-NEXT: vpmovsqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1] ; CHECK-NEXT: vpmovsqw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc0] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) @@ -2271,7 +2273,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovsqw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x24,0x07] ; CHECK-NEXT: vpmovsqw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x24,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64> 
%x1, i8 -1) call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) ret void @@ -2286,9 +2288,9 @@ ; CHECK-NEXT: vpmovusqw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x14,0xc2] ; CHECK-NEXT: vpmovusqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1] ; CHECK-NEXT: vpmovusqw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc0] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) @@ -2305,7 +2307,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovusqw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x14,0x07] ; CHECK-NEXT: vpmovusqw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x14,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) ret void @@ -2320,9 +2322,9 @@ ; CHECK-NEXT: vpmovqw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x34,0xc2] ; CHECK-NEXT: vpmovqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1] ; CHECK-NEXT: vpmovqw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc0] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) @@ -2339,7 +2341,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovqw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x34,0x07] ; CHECK-NEXT: vpmovqw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x34,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) ret void @@ -2354,9 +2356,9 @@ ; CHECK-NEXT: vpmovsqw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x24,0xc2] ; CHECK-NEXT: vpmovsqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1] ; CHECK-NEXT: vpmovsqw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc0] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: 
[0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) @@ -2373,7 +2375,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovsqw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x24,0x07] ; CHECK-NEXT: vpmovsqw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x24,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) ret void @@ -2388,9 +2390,9 @@ ; CHECK-NEXT: vpmovusqw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x14,0xc2] ; CHECK-NEXT: vpmovusqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1] ; CHECK-NEXT: vpmovusqw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc0] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) @@ -2407,7 +2409,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovusqw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x14,0x07] ; CHECK-NEXT: vpmovusqw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x14,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) ret void @@ -2422,9 +2424,9 @@ ; CHECK-NEXT: vpmovqd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x35,0xc2] ; CHECK-NEXT: vpmovqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x35,0xc1] ; CHECK-NEXT: vpmovqd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x35,0xc0] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] -; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1][1:0.50] +; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 
x i64> %x0, <4 x i32> %x1, i8 %x2) %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) @@ -2441,7 +2443,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovqd %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x35,0x07] ; CHECK-NEXT: vpmovqd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x35,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) ret void @@ -2456,9 +2458,9 @@ ; CHECK-NEXT: vpmovsqd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x25,0xc2] ; CHECK-NEXT: vpmovsqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x25,0xc1] ; CHECK-NEXT: vpmovsqd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x25,0xc0] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] -; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1][1:0.50] +; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) @@ -2475,7 +2477,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovsqd %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x25,0x07] ; CHECK-NEXT: vpmovsqd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x25,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) ret void @@ -2490,9 +2492,9 @@ ; CHECK-NEXT: vpmovusqd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x15,0xc2] ; CHECK-NEXT: vpmovusqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x15,0xc1] ; CHECK-NEXT: vpmovusqd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x15,0xc0] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] -; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1][1:0.50] +; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) @@ -2509,7 +2511,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovusqd %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x15,0x07] ; CHECK-NEXT: vpmovusqd %xmm0, 
(%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x15,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) ret void @@ -2524,9 +2526,9 @@ ; CHECK-NEXT: vpmovqd %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc2] ; CHECK-NEXT: vpmovqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1] ; CHECK-NEXT: vpmovqd %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc0] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] -; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1][1:0.50] +; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) @@ -2543,7 +2545,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovqd %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x35,0x07] ; CHECK-NEXT: vpmovqd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x35,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) ret void @@ -2558,9 +2560,9 @@ ; CHECK-NEXT: vpmovsqd %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x25,0xc2] ; CHECK-NEXT: vpmovsqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x25,0xc1] ; CHECK-NEXT: vpmovsqd %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x25,0xc0] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] -; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1][1:0.50] +; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) @@ -2577,7 +2579,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovsqd %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x25,0x07] ; CHECK-NEXT: vpmovsqd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x25,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) ret void @@ -2592,9 +2594,9 @@ ; 
CHECK-NEXT: vpmovusqd %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x15,0xc2] ; CHECK-NEXT: vpmovusqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x15,0xc1] ; CHECK-NEXT: vpmovusqd %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x15,0xc0] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] -; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1][1:0.50] +; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) @@ -2611,7 +2613,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovusqd %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x15,0x07] ; CHECK-NEXT: vpmovusqd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x15,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) ret void @@ -2626,9 +2628,9 @@ ; CHECK-NEXT: vpmovdb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc2] ; CHECK-NEXT: vpmovdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1] ; CHECK-NEXT: vpmovdb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc0] -; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1] -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1][1:0.50] +; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) @@ -2645,7 +2647,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovdb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x31,0x07] ; CHECK-NEXT: vpmovdb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x31,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) ret void @@ -2660,9 +2662,9 @@ ; CHECK-NEXT: vpmovsdb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc2] ; CHECK-NEXT: vpmovsdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1] ; CHECK-NEXT: vpmovsdb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc0] -; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1] -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2] -; CHECK-NEXT: retq ## 
encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1][1:0.50] +; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) @@ -2679,7 +2681,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovsdb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x21,0x07] ; CHECK-NEXT: vpmovsdb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x21,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) ret void @@ -2694,9 +2696,9 @@ ; CHECK-NEXT: vpmovusdb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc2] ; CHECK-NEXT: vpmovusdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1] ; CHECK-NEXT: vpmovusdb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc0] -; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1] -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1][1:0.50] +; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) @@ -2713,7 +2715,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovusdb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x11,0x07] ; CHECK-NEXT: vpmovusdb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x11,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) ret void @@ -2728,9 +2730,9 @@ ; CHECK-NEXT: vpmovdb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc2] ; CHECK-NEXT: vpmovdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1] ; CHECK-NEXT: vpmovdb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc0] -; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1] -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1][1:0.50] +; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 
%x2) @@ -2747,7 +2749,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovdb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x31,0x07] ; CHECK-NEXT: vpmovdb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x31,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) ret void @@ -2762,9 +2764,9 @@ ; CHECK-NEXT: vpmovsdb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc2] ; CHECK-NEXT: vpmovsdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1] ; CHECK-NEXT: vpmovsdb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc0] -; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1] -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1][1:0.50] +; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) @@ -2781,7 +2783,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovsdb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x21,0x07] ; CHECK-NEXT: vpmovsdb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x21,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) ret void @@ -2796,9 +2798,9 @@ ; CHECK-NEXT: vpmovusdb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc2] ; CHECK-NEXT: vpmovusdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1] ; CHECK-NEXT: vpmovusdb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc0] -; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1] -; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1][1:0.50] +; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) @@ -2815,7 +2817,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovusdb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x11,0x07] ; CHECK-NEXT: vpmovusdb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x11,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) ret void @@ 
-2830,9 +2832,9 @@ ; CHECK-NEXT: vpmovdw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x33,0xc2] ; CHECK-NEXT: vpmovdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1] ; CHECK-NEXT: vpmovdw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc0] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) @@ -2849,7 +2851,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovdw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x33,0x07] ; CHECK-NEXT: vpmovdw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x33,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) ret void @@ -2864,9 +2866,9 @@ ; CHECK-NEXT: vpmovsdw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x23,0xc2] ; CHECK-NEXT: vpmovsdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1] ; CHECK-NEXT: vpmovsdw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc0] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) @@ -2883,7 +2885,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovsdw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x23,0x07] ; CHECK-NEXT: vpmovsdw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x23,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) ret void @@ -2898,9 +2900,9 @@ ; CHECK-NEXT: vpmovusdw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x13,0xc2] ; CHECK-NEXT: vpmovusdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1] ; CHECK-NEXT: vpmovusdw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc0] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: 
[0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) @@ -2917,7 +2919,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovusdw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x13,0x07] ; CHECK-NEXT: vpmovusdw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x13,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) ret void @@ -2932,9 +2934,9 @@ ; CHECK-NEXT: vpmovdw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x33,0xc2] ; CHECK-NEXT: vpmovdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1] ; CHECK-NEXT: vpmovdw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) @@ -2951,7 +2953,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovdw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x33,0x07] ; CHECK-NEXT: vpmovdw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x33,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) ret void @@ -2966,9 +2968,9 @@ ; CHECK-NEXT: vpmovsdw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x23,0xc2] ; CHECK-NEXT: vpmovsdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1] ; CHECK-NEXT: vpmovsdw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc0] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) @@ -2985,7 +2987,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: 
[0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovsdw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x23,0x07] ; CHECK-NEXT: vpmovsdw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x23,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) ret void @@ -3000,9 +3002,9 @@ ; CHECK-NEXT: vpmovusdw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x13,0xc2] ; CHECK-NEXT: vpmovusdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x13,0xc1] ; CHECK-NEXT: vpmovusdw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc0] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1][1:0.50] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) @@ -3019,7 +3021,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpmovusdw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x13,0x07] ; CHECK-NEXT: vpmovusdw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x13,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) ret void @@ -3032,9 +3034,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5b,0xc8] -; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5b,0xc0] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5b,0xc0][4:1.00] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -3048,9 +3050,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5b,0xc8] -; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5b,0xc0] -; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5b,0xc0][4:1.00] +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0][3:1.00] +; 
CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -3064,9 +3066,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0xe6,0xc8] -; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0xe6,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0xe6,0xc0][4:1.00] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -3078,11 +3080,10 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0xe6,0xc8] -; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9] -; CHECK-NEXT: ## xmm1 = xmm1[0],zero -; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0xe6,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9]xmm1 = xmm1[0],zero[1:0.33] +; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0xe6,0xc0][4:1.00] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) @@ -3098,9 +3099,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0xe6,0xc8] -; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xc0][6:1.00] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -3114,9 +3115,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x5a,0xc8] -; CHECK-NEXT: 
vcvtpd2ps %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xc0] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xc0][5:1.00] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double> %x0, <4 x float> %x1, i8 %x2) %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double> %x0, <4 x float> %x1, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -3130,9 +3131,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8] -; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0][4:1.00] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -3144,11 +3145,10 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8] -; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9] -; CHECK-NEXT: ## xmm1 = xmm1[0],zero -; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9]xmm1 = xmm1[0],zero[1:0.33] +; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0][4:1.00] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1) @@ -3165,8 +3165,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8] ; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> 
@llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -3178,11 +3178,10 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8] -; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9] -; CHECK-NEXT: ## xmm1 = xmm1[0],zero +; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9]xmm1 = xmm1[0],zero[1:0.33] ; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) @@ -3199,8 +3198,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x29,0x79,0xc8] ; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x28,0x79,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -3214,9 +3213,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtps2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x5b,0xc8] -; CHECK-NEXT: vcvtps2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5b,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvtps2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5b,0xc0][3:1.00] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -3230,9 +3229,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtps2dq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x5b,0xc8] -; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5b,0xc0] -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5b,0xc0][3:1.00] +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -3246,9 +3245,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtps2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5a,0xc8] -; CHECK-NEXT: vcvtps2pd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0xc0] -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvtps2pd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0xc0][2:1.00] +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float> %x0, <2 x double> %x1, i8 %x2) %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float> %x0, <2 x double> %x1, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -3262,9 +3261,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtps2pd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5a,0xc8] -; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0xc0] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0xc0][5:1.00] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float> %x0, <4 x double> %x1, i8 %x2) %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float> %x0, <4 x double> %x1, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -3279,8 +3278,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtps2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x79,0xc8] ; CHECK-NEXT: vcvtps2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x79,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -3295,8 +3294,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtps2udq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x79,0xc8] ; CHECK-NEXT: vcvtps2udq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x79,0xc0] -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> 
@llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -3310,9 +3309,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8] -; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0][4:1.00] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -3324,11 +3323,10 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8] -; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9] -; CHECK-NEXT: ## xmm1 = xmm1[0],zero -; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9]xmm1 = xmm1[0],zero[1:0.33] +; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0][4:1.00] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) @@ -3344,9 +3342,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xe6,0xc8] -; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xc0][6:1.00] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -3361,8 +3359,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8] ; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0] -; 
CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -3374,11 +3372,10 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8] -; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9] -; CHECK-NEXT: ## xmm1 = xmm1[0],zero +; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9]xmm1 = xmm1[0],zero[1:0.33] ; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) @@ -3395,8 +3392,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x29,0x78,0xc8] ; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x28,0x78,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -3410,9 +3407,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x5b,0xc8] -; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xc0][3:1.00] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -3426,9 +3423,9 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x5b,0xc8] -; 
CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xc0] -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xc0][3:1.00] +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -3443,8 +3440,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttps2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x78,0xc8] ; CHECK-NEXT: vcvttps2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x78,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -3459,8 +3456,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvttps2udq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x78,0xc8] ; CHECK-NEXT: vcvttps2udq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x78,0xc0] -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -3475,8 +3472,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x7a,0xc8] ; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7f,0x08,0x7a,0xc0] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -3491,8 +3488,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x7a,0xc8] ; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7f,0x28,0x7a,0xc0] -; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX 
TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -3507,8 +3504,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrndscalepd $4, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x09,0xc8,0x04] ; CHECK-NEXT: vrndscalepd $88, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x09,0xc0,0x58] -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 4, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 88, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -3523,8 +3520,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrndscalepd $4, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x09,0xc8,0x04] ; CHECK-NEXT: vrndscalepd $88, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x09,0xc0,0x58] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 4, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 88, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -3539,8 +3536,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrndscaleps $88, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x08,0xc8,0x58] ; CHECK-NEXT: vrndscaleps $4, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x08,0xc0,0x04] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 88, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 4, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -3555,8 +3552,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrndscaleps $5, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x08,0xc8,0x05] ; CHECK-NEXT: vrndscaleps $66, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x08,0xc0,0x42] -; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 5, <8 x float> %x2, 
i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 66, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -3569,15 +3566,12 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x23,0xd9,0x16] -; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xd1,0x16] -; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc1,0x16] -; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x23,0xd9,0x16]ymm3 {%k1} {z} = ymm0[0,1,2,3],ymm1[4,5,6,7] +; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xd1,0x16]ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7] +; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc1,0x16]ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4) %res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1) %res2 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> zeroinitializer, i8 %x4) @@ -3592,15 +3586,12 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x23,0xd9,0x16] -; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1],ymm1[2,3] -; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xd1,0x16] -; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3] -; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc1,0x16] -; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3] -; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x23,0xd9,0x16]ymm3 {%k1} {z} = ymm0[0,1],ymm1[2,3] +; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xd1,0x16]ymm2 {%k1} = ymm0[0,1],ymm1[2,3] +; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc1,0x16]ymm0 = ymm0[0,1],ymm1[2,3] +; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0][3:1.00] 
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1) %res2 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> zeroinitializer, i8 %x4) @@ -3615,12 +3606,10 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x43,0xd1,0x16] -; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x43,0xc1,0x16] -; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x43,0xd1,0x16]ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7] +; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x43,0xc1,0x16]ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4) %res1 = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -3633,12 +3622,10 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xd1,0x16] -; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3] -; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x43,0xc1,0x16] -; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xd1,0x16]ymm2 {%k1} = ymm0[0,1],ymm1[2,3] +; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x43,0xc1,0x16]ymm0 = ymm0[0,1],ymm1[2,3] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4) %res1 = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -3654,9 +3641,9 @@ ; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0x89,0x26,0xd0,0x0b] ; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x26,0xc8,0x0b] ; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x26,0xc0,0x0b] -; CHECK-NEXT: vaddpd %xmm0, 
%xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] -; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> %x2, i8 %x3) %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> zeroinitializer, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> %x2, i8 -1) @@ -3673,8 +3660,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vgetmantpd $11, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x26,0xc8,0x0b] ; CHECK-NEXT: vgetmantpd $11, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x26,0xc0,0x0b] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %x0, i32 11, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %x0, i32 11, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -3689,8 +3676,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vgetmantps $11, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x26,0xc8,0x0b] ; CHECK-NEXT: vgetmantps $11, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x26,0xc0,0x0b] -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %x0, i32 11, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %x0, i32 11, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -3705,8 +3692,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vgetmantps $11, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x26,0xc8,0x0b] ; CHECK-NEXT: vgetmantps $11, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x26,0xc0,0x0b] -; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -3719,11 +3706,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## 
EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8][1:0.33] ; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0x75,0x08,0x25,0xda,0x21] ; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x25,0xc2,0x21] -; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -3736,11 +3723,11 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8][1:0.33] ; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0x75,0x08,0x25,0xda,0x21] ; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x25,0xc2,0x21] -; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4) %res1 = call <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -3753,11 +3740,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8][1:0.33] ; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0x75,0x28,0x25,0xda,0x21] ; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x25,0xc2,0x21] -; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -3770,11 +3757,11 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: 
[0xc5,0xfd,0x6f,0xd8][1:0.33] ; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0x75,0x28,0x25,0xda,0x21] ; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x25,0xc2,0x21] -; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4) %res1 = call <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -3787,11 +3774,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8][1:0.33] ; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0xf5,0x08,0x25,0xda,0x21] ; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x25,0xc2,0x21] -; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -3804,11 +3791,11 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8][1:0.33] ; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0xf5,0x08,0x25,0xda,0x21] ; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x25,0xc2,0x21] -; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4) %res1 = call <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -3821,11 +3808,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8][1:0.33] ; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 ## encoding: 
[0x62,0xf3,0xf5,0x28,0x25,0xda,0x21] ; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x25,0xc2,0x21] -; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -3838,11 +3825,11 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8][1:0.33] ; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0xf5,0x28,0x25,0xda,0x21] ; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x25,0xc2,0x21] -; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4) %res1 = call <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -3852,8 +3839,8 @@ define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) { ; CHECK-LABEL: test_x86_vcvtph2ps_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0][4:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 -1) ret <4 x float> %res } @@ -3863,8 +3850,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x13,0xc8] -; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> %a1, i8 %mask) ret <4 x float> %res } @@ -3875,7 +3862,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x13,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 %mask) ret <4 x float> %res } @@ -3885,8 +3872,8 @@ define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) { ; 
CHECK-LABEL: test_x86_vcvtph2ps_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtph2ps %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvtph2ps %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0][4:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 -1) ret <8 x float> %res } @@ -3896,8 +3883,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtph2ps %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x13,0xc8] -; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> %a1, i8 %mask) ret <8 x float> %res } @@ -3907,7 +3894,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x13,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 %mask) ret <8 x float> %res } @@ -3920,10 +3907,10 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtps2ph $2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02] ; CHECK-NEXT: vcvtps2ph $2, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc2,0x02] -; CHECK-NEXT: vcvtps2ph $2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x02] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0xfd,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvtps2ph $2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x02][4:1.00] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0xfd,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1) %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 %mask) %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> %src, i8 %mask) @@ -3940,10 +3927,10 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vcvtps2ph $2, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02] ; CHECK-NEXT: vcvtps2ph $2, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc2,0x02] -; CHECK-NEXT: vcvtps2ph $2, %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x02] -; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2] -; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0xfd,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vcvtps2ph $2, %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x02][4:1.00] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## 
encoding: [0xc5,0xf9,0xfd,0xc2][1:0.50] +; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0xfd,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1) %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 %mask) %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> %src, i8 %mask) @@ -3958,7 +3945,7 @@ ; CHECK-LABEL: test_rsqrt_ps_256_rr: ; CHECK: ## BB#0: ; CHECK-NEXT: vrsqrt14ps %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x4e,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1) ret <8 x float> %res } @@ -3968,7 +3955,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrsqrt14ps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x4e,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask) ret <8 x float> %res } @@ -3978,8 +3965,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrsqrt14ps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x4e,0xc8] -; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask) ret <8 x float> %res } @@ -3988,7 +3975,7 @@ ; CHECK-LABEL: test_rsqrt_ps_128_rr: ; CHECK: ## BB#0: ; CHECK-NEXT: vrsqrt14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4e,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ret <4 x float> %res } @@ -3998,7 +3985,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrsqrt14ps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x4e,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) ret <4 x float> %res } @@ -4008,8 +3995,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrsqrt14ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x4e,0xc8] -; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) ret <4 x float> %res } @@ -4021,7 +4008,7 @@ ; CHECK-LABEL: test_rcp_ps_256_rr: ; CHECK: ## BB#0: ; CHECK-NEXT: vrcp14ps %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x4c,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## 
encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1) ret <8 x float> %res } @@ -4031,7 +4018,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrcp14ps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x4c,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask) ret <8 x float> %res } @@ -4041,8 +4028,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrcp14ps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x4c,0xc8] -; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask) ret <8 x float> %res } @@ -4051,7 +4038,7 @@ ; CHECK-LABEL: test_rcp_ps_128_rr: ; CHECK: ## BB#0: ; CHECK-NEXT: vrcp14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4c,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ret <4 x float> %res } @@ -4061,7 +4048,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrcp14ps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x4c,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) ret <4 x float> %res } @@ -4071,8 +4058,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrcp14ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x4c,0xc8] -; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) ret <4 x float> %res } @@ -4084,7 +4071,7 @@ ; CHECK-LABEL: test_rsqrt_pd_256_rr: ; CHECK: ## BB#0: ; CHECK-NEXT: vrsqrt14pd %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x4e,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1) ret <4 x double> %res } @@ -4094,7 +4081,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrsqrt14pd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x4e,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask) ret <4 x double> %res } @@ -4104,8 +4091,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrsqrt14pd %ymm0, %ymm1 {%k1} ## encoding: 
[0x62,0xf2,0xfd,0x29,0x4e,0xc8] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask) ret <4 x double> %res } @@ -4114,7 +4101,7 @@ ; CHECK-LABEL: test_rsqrt_pd_128_rr: ; CHECK: ## BB#0: ; CHECK-NEXT: vrsqrt14pd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x4e,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1) ret <2 x double> %res } @@ -4124,7 +4111,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrsqrt14pd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x4e,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask) ret <2 x double> %res } @@ -4134,8 +4121,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrsqrt14pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x4e,0xc8] -; CHECK-NEXT: vmovapd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovapd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask) ret <2 x double> %res } @@ -4147,7 +4134,7 @@ ; CHECK-LABEL: test_rcp_pd_256_rr: ; CHECK: ## BB#0: ; CHECK-NEXT: vrcp14pd %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x4c,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1) ret <4 x double> %res } @@ -4157,7 +4144,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrcp14pd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x4c,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask) ret <4 x double> %res } @@ -4167,8 +4154,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrcp14pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x4c,0xc8] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask) ret <4 x double> %res } @@ -4177,7 +4164,7 @@ ; CHECK-LABEL: test_rcp_pd_128_rr: ; CHECK: ## BB#0: ; CHECK-NEXT: vrcp14pd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x4c,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> 
@llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1) ret <2 x double> %res } @@ -4187,7 +4174,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrcp14pd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x4c,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask) ret <2 x double> %res } @@ -4197,8 +4184,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vrcp14pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x4c,0xc8] -; CHECK-NEXT: vmovapd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovapd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1][1:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask) ret <2 x double> %res } @@ -4213,15 +4200,12 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x23,0xd0,0x00] -; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xc8,0x00] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc0,0x00] -; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] -; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x23,0xd0,0x00]ymm2 {%k1} {z} = ymm0[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xc8,0x00]ymm1 {%k1} = ymm0[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc0,0x00]ymm0 = ymm0[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1][3:1.00] +; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 -1) %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask) %res3 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %mask) @@ -4234,10 +4218,9 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256_load: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] -; CHECK-NEXT: vshuff32x4 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x23,0xc1,0x00] -; CHECK-NEXT: ## ymm0 {%k1} = ymm1[0,1,2,3,0,1,2,3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression 
encoding: [0xc5,0xf8,0x28,0x0f][4:0.50] +; CHECK-NEXT: vshuff32x4 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x23,0xc1,0x00]ymm0 {%k1} = ymm1[0,1,2,3,0,1,2,3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %x0 = load <4 x float>, <4 x float>* %x0ptr %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask) ret <8 x float> %res @@ -4250,15 +4233,12 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x43,0xd0,0x00] -; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x43,0xc8,0x00] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x43,0xc0,0x00] -; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x43,0xd0,0x00]ymm2 {%k1} {z} = ymm0[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x43,0xc8,0x00]ymm1 {%k1} = ymm0[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x43,0xc0,0x00]ymm0 = ymm0[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1][1:0.50] +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res1 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1) %res2 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) %res3 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask) @@ -4271,10 +4251,9 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256_load: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f] -; CHECK-NEXT: vshufi32x4 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x43,0xc1,0x00] -; CHECK-NEXT: ## ymm0 {%k1} = ymm1[0,1,2,3,0,1,2,3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f][4:0.50] +; CHECK-NEXT: vshufi32x4 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x43,0xc1,0x00]ymm0 {%k1} = ymm1[0,1,2,3,0,1,2,3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %x0 = load <4 x i32>, <4 x i32>* %x0ptr %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) ret <8 x i32> %res @@ -4289,9 +4268,9 @@ ; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x14,0xd9] ; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x14,0xd1] ; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x14,0xc1] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX 
Compression encoding: [0xc5,0xe9,0xfe,0xc0] -; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3) %res2 = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) @@ -4309,9 +4288,9 @@ ; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x14,0xd9] ; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x14,0xd1] ; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x14,0xc1] -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) %res2 = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -4329,9 +4308,9 @@ ; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x14,0xd9] ; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x14,0xd1] ; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x14,0xc1] -; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] -; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3) %res2 = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -4349,9 +4328,9 @@ ; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x14,0xd9] ; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x14,0xd1] ; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x14,0xc1] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression 
encoding: [0xc5,0xfd,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) %res2 = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -4369,9 +4348,9 @@ ; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc8,0x03] ; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc8,0x03] ; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03] -; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca][1:0.50] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3) %res2 = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1) @@ -4389,9 +4368,9 @@ ; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc8,0x03] ; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc8,0x03] ; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03] -; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca][1:0.50] +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3) %res2 = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1) @@ -4409,9 +4388,9 @@ ; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc8,0x03] ; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc8,0x03] ; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03] -; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf1,0xd4,0xca][1:0.50] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> zeroinitializer, i8 %x3) %res2 = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 -1) @@ -4429,9 +4408,9 @@ ; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc8,0x03] ; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc8,0x03] ; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03] -; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca][1:0.50] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3) %res2 = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1) @@ -4449,9 +4428,9 @@ ; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x15,0xd9] ; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x15,0xd1] ; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x15,0xc1] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] -; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3) %res2 = call <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) @@ -4469,9 +4448,9 @@ ; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x15,0xd9] ; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x15,0xd1] ; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x15,0xc1] -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## 
encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) %res2 = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -4489,9 +4468,9 @@ ; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x15,0xd9] ; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x15,0xd1] ; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x15,0xc1] -; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] -; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3) %res2 = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -4509,9 +4488,9 @@ ; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x15,0xd9] ; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x15,0xd1] ; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x15,0xc1] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) %res2 = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -4529,9 +4508,9 @@ ; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc0,0x03] ; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc0,0x03] ; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03] -; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca][1:0.50] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x 
i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3) %res2 = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1) @@ -4549,9 +4528,9 @@ ; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc0,0x03] ; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc0,0x03] ; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03] -; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca][1:0.50] +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3) %res2 = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1) @@ -4569,9 +4548,9 @@ ; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc0,0x03] ; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc0,0x03] ; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03] -; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca][1:0.50] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> zeroinitializer, i8 %x3) %res2 = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 -1) @@ -4589,9 +4568,9 @@ ; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc0,0x03] ; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc0,0x03] ; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03] -; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca][1:0.50] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3) %res2 = call <4 x i64> 
@llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1) @@ -4609,9 +4588,9 @@ ; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0xf5,0x28,0x16,0xd8] ; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0x16,0xd0] ; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0] -; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; CHECK-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3) %res2 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) @@ -4629,9 +4608,9 @@ ; CHECK-NEXT: vpermq %ymm0, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0xf5,0x28,0x36,0xd8] ; CHECK-NEXT: vpermq %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0x36,0xd0] ; CHECK-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) %res2 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -4646,12 +4625,12 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xd8] +; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xd8][3:1.00] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0x16,0xd0] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x16,0xc0] -; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> 
@llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3) %res2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) @@ -4666,12 +4645,12 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x36,0xd8] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x36,0xd8][3:1.00] ; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0x36,0xd0] ; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x36,0xc0] -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) %res2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -4686,14 +4665,14 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8] +; CHECK-NEXT: vmovapd %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8][1:1.00] ; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4][1:0.33] ; CHECK-NEXT: vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04] ; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xcc] -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xcc][3:1.00] +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4) %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4) %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 
3, i8 -1) @@ -4708,12 +4687,12 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8] +; CHECK-NEXT: vmovapd %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8][1:1.00] ; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xda,0x05] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2][1:0.33] ; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4) %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 3, i8 %x4) ;%res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 -1) @@ -4728,14 +4707,14 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8] +; CHECK-NEXT: vmovapd %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8][1:1.00] ; CHECK-NEXT: vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04] -; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4] +; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4][1:0.33] ; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05] ; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcc] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcc][3:1.00] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4) %res2 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 3, i8 -1) @@ -4750,15 +4729,15 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: 
[0xc5,0xfd,0x28,0xd8] +; CHECK-NEXT: vmovapd %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8][1:1.00] ; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xda,0x05] -; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4] -; CHECK-NEXT: vmovapd %ymm0, %ymm5 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe8] +; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4][1:0.33] +; CHECK-NEXT: vmovapd %ymm0, %ymm5 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe8][1:1.00] ; CHECK-NEXT: vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04] ; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcd] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcd][3:1.00] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4) %res2 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 3, i8 -1) @@ -4773,15 +4752,15 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8] +; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8][1:1.00] ; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0x75,0x08,0x54,0xda,0x05] -; CHECK-NEXT: vmovaps %xmm0, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xe0] +; CHECK-NEXT: vmovaps %xmm0, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xe0][1:1.00] ; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm4 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x54,0xe2,0x05] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2][1:0.33] ; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x54,0xc2,0x05] -; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd8,0x58,0xc0] -; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd8,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4) %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> 
%x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4) %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1) @@ -4796,15 +4775,15 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8] +; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8][1:1.00] ; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0x75,0x08,0x54,0xda,0x05] -; CHECK-NEXT: vmovaps %xmm0, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xe0] +; CHECK-NEXT: vmovaps %xmm0, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xe0][1:1.00] ; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x54,0xe2,0x05] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2][1:0.33] ; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x54,0xc2,0x05] -; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd8,0x58,0xc0] -; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd8,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4) %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4) %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1) @@ -4819,15 +4798,15 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8] +; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8][1:1.00] ; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0x75,0x28,0x54,0xda,0x05] -; CHECK-NEXT: vmovaps %ymm0, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xe0] +; CHECK-NEXT: vmovaps %ymm0, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xe0][1:1.00] ; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm4 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x54,0xe2,0x05] -; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] +; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2][1:0.33] ; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x54,0xc2,0x05] -; CHECK-NEXT: vaddps %ymm0, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdc,0x58,0xc0] -; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm0, %ymm4, %ymm0 ## EVEX 
TO VEX Compression encoding: [0xc5,0xdc,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 %x4) %res1 = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> zeroinitializer, i32 5, i8 %x4) %res2 = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 -1) @@ -4842,15 +4821,15 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8] +; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8][1:1.00] ; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0x75,0x28,0x54,0xda,0x05] -; CHECK-NEXT: vmovaps %ymm0, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xe0] +; CHECK-NEXT: vmovaps %ymm0, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xe0][1:1.00] ; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x54,0xe2,0x05] -; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] +; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2][1:0.33] ; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x54,0xc2,0x05] -; CHECK-NEXT: vaddps %ymm0, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdc,0x58,0xc0] -; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm0, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdc,0x58,0xc0][3:1.00] +; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 %x4) %res1 = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> zeroinitializer, i32 5, i8 %x4) %res2 = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 -1) @@ -4869,9 +4848,9 @@ ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; CHECK-NEXT: vptestmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8][1:0.25] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1) %res2 = add i8 %res, %res1 @@ -4888,9 +4867,9 @@ ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; CHECK-NEXT: vptestmd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x27,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: 
[0x00,0xc8] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8][1:0.25] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1) %res2 = add i8 %res, %res1 @@ -4907,9 +4886,9 @@ ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; CHECK-NEXT: vptestmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8][1:0.25] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1) %res2 = add i8 %res, %res1 @@ -4926,9 +4905,9 @@ ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; CHECK-NEXT: vptestmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8][1:0.25] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1) %res2 = add i8 %res, %res1 @@ -4945,9 +4924,9 @@ ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; CHECK-NEXT: vptestnmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8][1:0.25] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1) %res2 = add i8 %res, %res1 @@ -4964,9 +4943,9 @@ ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; CHECK-NEXT: vptestnmd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x27,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8][1:0.25] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1) %res2 = add i8 %res, %res1 @@ -4983,9 +4962,9 @@ ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; CHECK-NEXT: vptestnmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8][1:0.25] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: 
[0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1) %res2 = add i8 %res, %res1 @@ -5002,9 +4981,9 @@ ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; CHECK-NEXT: vptestnmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8][1:0.25] ; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1) %res2 = add i8 %res, %res1 @@ -5020,9 +4999,9 @@ ; CHECK-NEXT: vpbroadcastd %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7c,0xcf] ; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7c,0xc7] ; CHECK-NEXT: vpbroadcastd %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7c,0xd7] -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 -1) %res1 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 %mask) %res2 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> zeroinitializer, i8 %mask) @@ -5040,9 +5019,9 @@ ; CHECK-NEXT: vpbroadcastd %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7c,0xcf] ; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7c,0xc7] ; CHECK-NEXT: vpbroadcastd %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xd7] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0][1:0.50] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 -1) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 %mask) %res2 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> zeroinitializer, i8 %mask) @@ -5060,9 +5039,9 @@ ; CHECK-NEXT: vpbroadcastq %rdi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7c,0xcf] ; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7c,0xc7] ; CHECK-NEXT: vpbroadcastq %rdi, %ymm2 ## encoding: [0x62,0xf2,0xfd,0x28,0x7c,0xd7] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression 
encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 -1) %res1 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 %mask) %res2 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> zeroinitializer,i8 %mask) @@ -5080,9 +5059,9 @@ ; CHECK-NEXT: vpbroadcastq %rdi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7c,0xcf] ; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7c,0xc7] ; CHECK-NEXT: vpbroadcastq %rdi, %xmm2 ## encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd7] -; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0][1:0.50] +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0][1:0.50] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 -1) %res1 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 %mask) %res2 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> zeroinitializer,i8 %mask) @@ -5096,7 +5075,7 @@ ; CHECK-LABEL: test_x86_avx512_psra_q_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe2,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } @@ -5105,8 +5084,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xe2,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] %mask.cast = bitcast i8 %mask to <8 x i1> %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> @@ -5118,7 +5097,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xe2,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] %mask.cast = bitcast i8 %mask to <8 x i1> %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> @@ -5132,7 +5111,7 @@ ; CHECK-LABEL: test_x86_avx512_psra_q_256: ; CHECK: ## BB#0: ; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xe2,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] 
+; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -5141,8 +5120,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xe2,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1] %mask.cast = bitcast i8 %mask to <8 x i1> %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> @@ -5154,7 +5133,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xe2,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1] %mask.cast = bitcast i8 %mask to <8 x i1> %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> @@ -5168,7 +5147,7 @@ ; CHECK-LABEL: test_x86_avx512_psrai_q_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpsraq $7, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xe0,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } @@ -5177,8 +5156,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsraq $7, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xe0,0x07] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] %mask.cast = bitcast i8 %mask to <8 x i1> %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> @@ -5190,7 +5169,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsraq $7, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x72,0xe0,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] %mask.cast = bitcast i8 %mask to <8 x i1> %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> @@ -5204,7 +5183,7 @@ ; CHECK-LABEL: test_x86_avx512_psrai_q_256: ; CHECK: ## BB#0: ; CHECK-NEXT: vpsraq $7, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xe0,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -5213,8 +5192,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsraq $7, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xe0,0x07] 
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] %mask.cast = bitcast i8 %mask to <8 x i1> %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> @@ -5226,7 +5205,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsraq $7, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x72,0xe0,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] %mask.cast = bitcast i8 %mask to <8 x i1> %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> @@ -5239,7 +5218,7 @@ ; CHECK-LABEL: test_x86_avx512_psrav_q_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x46,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1) ret <2 x i64> %res } @@ -5249,8 +5228,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x46,0xd1] -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1) %mask.cast = bitcast i8 %mask to <8 x i1> %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> @@ -5263,7 +5242,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x46,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1) %mask.cast = bitcast i8 %mask to <8 x i1> %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> @@ -5277,7 +5256,7 @@ ; CHECK-LABEL: test_x86_avx512_psrav_q_256: ; CHECK: ## BB#0: ; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x46,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1) ret <4 x i64> %res } @@ -5287,8 +5266,8 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x46,0xd1] -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2][1:0.33] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1) %mask.cast = bitcast i8 %mask to <8 x i1> %mask.extract = shufflevector <8 x i1> 
%mask.cast, <8 x i1> undef, <4 x i32> @@ -5301,7 +5280,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x46,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1) %mask.cast = bitcast i8 %mask to <8 x i1> %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> @@ -5318,7 +5297,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfmadd132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x98,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind ret <8 x float> %res } @@ -5330,7 +5309,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind ret <4 x float> %res } @@ -5342,7 +5321,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) ret <4 x double> %res } @@ -5354,7 +5333,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) ret <2 x double> %res } @@ -5363,11 +5342,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9] +; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa8,0xda] ; CHECK-NEXT: vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1] -; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -5380,11 +5359,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw 
%edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9] +; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa8,0xda] ; CHECK-NEXT: vfmadd231pd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb8,0xd1] -; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -5397,11 +5376,11 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9] +; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa8,0xda] ; CHECK-NEXT: vfmadd213pd %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0xa8,0xca] -; CHECK-NEXT: vaddpd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -5412,11 +5391,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9] +; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa8,0xda] ; CHECK-NEXT: vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1] -; CHECK-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -5429,11 +5408,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX 
Compression encoding: [0xc5,0xfd,0x28,0xd9] +; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa8,0xda] ; CHECK-NEXT: vfmadd231pd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb8,0xd1] -; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -5446,11 +5425,11 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9] +; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa8,0xda] ; CHECK-NEXT: vfmadd213pd %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0xa8,0xca] -; CHECK-NEXT: vaddpd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -5461,11 +5440,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9] +; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa8,0xda] ; CHECK-NEXT: vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1] -; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -5478,11 +5457,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9] +; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression 
encoding: [0xc5,0xf8,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa8,0xda] ; CHECK-NEXT: vfmadd231ps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb8,0xd1] -; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -5495,11 +5474,11 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9] +; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa8,0xda] ; CHECK-NEXT: vfmadd213ps %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0xa8,0xca] -; CHECK-NEXT: vaddps %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -5510,11 +5489,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9] +; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa8,0xda] ; CHECK-NEXT: vfmadd132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x98,0xc1] -; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -5527,11 +5506,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9] +; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x7d,0xa8,0xda] ; CHECK-NEXT: vfmadd231ps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb8,0xd1] -; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -5544,11 +5523,11 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9] +; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa8,0xda] ; CHECK-NEXT: vfmadd213ps %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0xa8,0xca] -; CHECK-NEXT: vaddps %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -5562,11 +5541,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9] +; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaa,0xda] ; CHECK-NEXT: vfmsub231pd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xba,0xd1] -; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -5580,11 +5559,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9] +; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xaa,0xda] ; CHECK-NEXT: vfmsub231pd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: 
[0x62,0xf2,0xfd,0x29,0xba,0xd1] -; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -5597,11 +5576,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9] +; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaa,0xda] ; CHECK-NEXT: vfmsub231ps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xba,0xd1] -; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -5614,11 +5593,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9] +; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xaa,0xda] ; CHECK-NEXT: vfmsub231ps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xba,0xd1] -; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -5632,7 +5611,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfnmadd132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind ret <8 x float> %res } @@ -5644,7 +5623,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfnmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xc1] -; 
CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind ret <4 x float> %res } @@ -5656,7 +5635,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfnmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9c,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind ret <4 x double> %res } @@ -5668,7 +5647,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfnmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9c,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind ret <2 x double> %res } @@ -5680,7 +5659,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfnmsub132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind ret <8 x float> %res } @@ -5692,7 +5671,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfnmsub132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind ret <4 x float> %res } @@ -5704,7 +5683,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfnmsub132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9e,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind ret <4 x double> %res } @@ -5716,7 +5695,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfnmsub132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9e,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind ret <2 x double> %res } @@ -5726,11 +5705,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9] +; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xae,0xda] ; CHECK-NEXT: vfnmsub132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9e,0xc1] -; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO 
VEX Compression encoding: [0xc5,0xf9,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -5743,11 +5722,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9] +; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xae,0xda] ; CHECK-NEXT: vfnmsub231pd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbe,0xd1] -; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -5758,11 +5737,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9] +; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xae,0xda] ; CHECK-NEXT: vfnmsub132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9e,0xc1] -; CHECK-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -5775,11 +5754,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9] +; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xae,0xda] ; CHECK-NEXT: vfnmsub231pd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xbe,0xd1] -; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] 
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -5790,11 +5769,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9] +; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xae,0xda] ; CHECK-NEXT: vfnmsub132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xc1] -; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -5807,11 +5786,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9] +; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xae,0xda] ; CHECK-NEXT: vfnmsub231ps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbe,0xd1] -; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -5822,11 +5801,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9] +; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xae,0xda] ; CHECK-NEXT: vfnmsub132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xc1] -; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xfc,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -5839,11 +5818,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9] +; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xae,0xda] ; CHECK-NEXT: vfnmsub231ps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xbe,0xd1] -; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -5854,11 +5833,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9] +; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xac,0xda] ; CHECK-NEXT: vfnmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9c,0xc1] -; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -5869,11 +5848,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9] +; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xac,0xda] ; CHECK-NEXT: vfnmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9c,0xc1] -; CHECK-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> 
@llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -5884,11 +5863,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9] +; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xac,0xda] ; CHECK-NEXT: vfnmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xc1] -; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -5899,11 +5878,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9] +; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xac,0xda] ; CHECK-NEXT: vfnmadd132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xc1] -; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -5917,7 +5896,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfmaddsub132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x96,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) ret <8 x float> %res } @@ -5929,7 +5908,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) ret <4 x float> %res } @@ -5941,7 +5920,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: 
vfmaddsub132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x96,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind ret <4 x double> %res } @@ -5953,7 +5932,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfmaddsub132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x96,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind ret <2 x double> %res } @@ -5962,11 +5941,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9] +; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa6,0xda] ; CHECK-NEXT: vfmaddsub132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x96,0xc1] -; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -5979,11 +5958,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9] +; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa6,0xda] ; CHECK-NEXT: vfmaddsub231pd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb6,0xd1] -; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -5996,11 +5975,11 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9] +; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX 
Compression encoding: [0xc4,0xe2,0xf9,0xa6,0xda] ; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0xa6,0xca] -; CHECK-NEXT: vaddpd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -6011,11 +5990,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9] +; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa6,0xda] ; CHECK-NEXT: vfmaddsub132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x96,0xc1] -; CHECK-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -6028,11 +6007,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9] +; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa6,0xda] ; CHECK-NEXT: vfmaddsub231pd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb6,0xd1] -; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -6045,11 +6024,11 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9] +; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa6,0xda] ; 
CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0xa6,0xca] -; CHECK-NEXT: vaddpd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -6060,11 +6039,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9] +; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa6,0xda] ; CHECK-NEXT: vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1] -; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -6077,11 +6056,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9] +; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa6,0xda] ; CHECK-NEXT: vfmaddsub231ps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb6,0xd1] -; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -6094,11 +6073,11 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9] +; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa6,0xda] ; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: 
[0x62,0xf2,0x7d,0x89,0xa6,0xca] -; CHECK-NEXT: vaddps %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -6109,11 +6088,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9] +; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa6,0xda] ; CHECK-NEXT: vfmaddsub132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x96,0xc1] -; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -6126,11 +6105,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9] +; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa6,0xda] ; CHECK-NEXT: vfmaddsub231ps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb6,0xd1] -; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -6143,11 +6122,11 @@ ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9] +; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa6,0xda] ; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0xa6,0xca] -; CHECK-NEXT: vaddps %ymm3, %ymm1, %ymm0 ## 
EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -6160,11 +6139,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9] +; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmsubadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa7,0xda] ; CHECK-NEXT: vfmsubadd231pd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb7,0xd1] -; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) %res2=fadd <2 x double> %res, %res1 @@ -6177,11 +6156,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9] +; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmsubadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa7,0xda] ; CHECK-NEXT: vfmsubadd231pd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb7,0xd1] -; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) %res2=fadd <4 x double> %res, %res1 @@ -6194,11 +6173,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_128: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9] +; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmsubadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa7,0xda] ; CHECK-NEXT: vfmsubadd231ps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb7,0xd1] -; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] -; 
CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) %res2=fadd <4 x float> %res, %res1 @@ -6211,11 +6190,11 @@ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_256: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9] +; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9][1:1.00] ; CHECK-NEXT: vfmsubadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa7,0xda] ; CHECK-NEXT: vfmsubadd231ps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb7,0xd1] -; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3][3:1.00] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2=fadd <8 x float> %res, %res1 @@ -6228,7 +6207,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind ret <4 x float> %res } @@ -6237,7 +6216,7 @@ ; CHECK-LABEL: test_mask_vfmadd128_ps_rz: ; CHECK: ## BB#0: ; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind ret <4 x float> %res } @@ -6247,7 +6226,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %a2 = load <4 x float>, <4 x float>* %ptr_a2 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind ret <4 x float> %res @@ -6258,7 +6237,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 8 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind ret <4 x float> %res @@ -6268,7 +6247,7 @@ ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz: ; 
CHECK: ## BB#0: ; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %a2 = load <4 x float>, <4 x float>* %ptr_a2 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind ret <4 x float> %res @@ -6278,7 +6257,7 @@ ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza: ; CHECK: ## BB#0: ; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 4 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind ret <4 x float> %res @@ -6289,7 +6268,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load float, float* %ptr_a2 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 @@ -6304,7 +6283,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load float, float* %ptr_a2, align 4 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 @@ -6318,7 +6297,7 @@ ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz: ; CHECK: ## BB#0: ; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load float, float* %ptr_a2 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 @@ -6332,7 +6311,7 @@ ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza: ; CHECK: ## BB#0: ; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %q = load float, float* %ptr_a2, align 4 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 @@ -6347,7 +6326,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind ret <2 x double> %res } @@ -6356,7 +6335,7 @@ ; CHECK-LABEL: test_mask_vfmadd128_pd_rz: ; CHECK: ## BB#0: ; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind 
ret <2 x double> %res } @@ -6366,7 +6345,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %a2 = load <2 x double>, <2 x double>* %ptr_a2 %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind ret <2 x double> %res @@ -6376,7 +6355,7 @@ ; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz: ; CHECK: ## BB#0: ; CHECK-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %a2 = load <2 x double>, <2 x double>* %ptr_a2 %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind ret <2 x double> %res @@ -6387,7 +6366,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind ret <4 x double> %res } @@ -6396,7 +6375,7 @@ ; CHECK-LABEL: test_mask_vfmadd256_pd_rz: ; CHECK: ## BB#0: ; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind ret <4 x double> %res } @@ -6406,7 +6385,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %a2 = load <4 x double>, <4 x double>* %ptr_a2 %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind ret <4 x double> %res @@ -6416,7 +6395,7 @@ ; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz: ; CHECK: ## BB#0: ; CHECK-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0x07] -; CHECK-NEXT: retq ## encoding: [0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3][1:1.00] %a2 = load <4 x double>, <4 x double>* %ptr_a2 %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind ret <4 x double> %res Index: test/CodeGen/X86/compress_expand.ll =================================================================== --- test/CodeGen/X86/compress_expand.ll +++ test/CodeGen/X86/compress_expand.ll @@ -11,16 +11,18 @@ ; SKX-LABEL: test1: ; SKX: # BB#0: ; SKX-NEXT: movw $-2049, %ax # imm = 0xF7FF +; SKX-NEXT: # [1:0.25] ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: retq # [1:1.00] ; ; KNL-LABEL: test1: ; KNL: # BB#0: ; KNL-NEXT: movw $-2049, %ax # imm = 0xF7FF +; KNL-NEXT: # [1:0.25] ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} -; KNL-NEXT: retq +; KNL-NEXT: retq # [1:1.00] %res = call <16 x float> 
@llvm.masked.expandload.v16f32(float* %base, <16 x i1> , <16 x float> undef)
ret <16 x float>%res
}
@@ -29,16 +31,18 @@
; SKX-LABEL: test2:
; SKX: # BB#0:
; SKX-NEXT: movw $30719, %ax # imm = 0x77FF
+; SKX-NEXT: # [1:0.25]
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vexpandps (%rdi), %zmm0 {%k1}
-; SKX-NEXT: retq
+; SKX-NEXT: retq # [1:1.00]
;
; KNL-LABEL: test2:
; KNL: # BB#0:
; KNL-NEXT: movw $30719, %ax # imm = 0x77FF
+; KNL-NEXT: # [1:0.25]
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vexpandps (%rdi), %zmm0 {%k1}
-; KNL-NEXT: retq
+; KNL-NEXT: retq # [1:1.00]
%res = call <16 x float> @llvm.masked.expandload.v16f32(float* %base, <16 x i1> , <16 x float> %src0)
ret <16 x float>%res
}
@@ -46,10 +50,10 @@
define <8 x double> @test3(double* %base, <8 x double> %src0, <8 x i1> %mask) {
; SKX-LABEL: test3:
; SKX: # BB#0:
-; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
+; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 # [1:1.00]
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vexpandpd (%rdi), %zmm0 {%k1}
-; SKX-NEXT: retq
+; SKX-NEXT: retq # [1:1.00]
;
; KNL-LABEL: test3:
; KNL: # BB#0:
@@ -57,7 +61,7 @@
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vexpandpd (%rdi), %zmm0 {%k1}
-; KNL-NEXT: retq
+; KNL-NEXT: retq # [1:1.00]
%res = call <8 x double> @llvm.masked.expandload.v8f64(double* %base, <8 x i1> %mask, <8 x double> %src0)
ret <8 x double>%res
}
@@ -65,19 +69,19 @@
define <4 x float> @test4(float* %base, <4 x float> %src0) {
; SKX-LABEL: test4:
; SKX: # BB#0:
-; SKX-NEXT: movb $7, %al
+; SKX-NEXT: movb $7, %al # [1:0.25]
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vexpandps (%rdi), %xmm0 {%k1}
-; SKX-NEXT: retq
+; SKX-NEXT: retq # [1:1.00]
;
; KNL-LABEL: test4:
; KNL: # BB#0:
; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; KNL-NEXT: movw $7, %ax
+; KNL-NEXT: movw $7, %ax # [1:0.25]
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vexpandps (%rdi), %zmm0 {%k1}
; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; KNL-NEXT: retq
+; KNL-NEXT: retq # [1:1.00]
%res = call <4 x float> @llvm.masked.expandload.v4f32(float* %base, <4 x i1> , <4 x float> %src0)
ret <4 x float>%res
}
@@ -85,19 +89,19 @@
define <2 x i64> @test5(i64* %base, <2 x i64> %src0) {
; SKX-LABEL: test5:
; SKX: # BB#0:
-; SKX-NEXT: movb $2, %al
+; SKX-NEXT: movb $2, %al # [1:0.25]
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vpexpandq (%rdi), %xmm0 {%k1}
-; SKX-NEXT: retq
+; SKX-NEXT: retq # [1:1.00]
;
; KNL-LABEL: test5:
; KNL: # BB#0:
; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; KNL-NEXT: movb $2, %al
+; KNL-NEXT: movb $2, %al # [1:0.25]
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpexpandq (%rdi), %zmm0 {%k1}
; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; KNL-NEXT: retq
+; KNL-NEXT: retq # [1:1.00]
%res = call <2 x i64> @llvm.masked.expandload.v2i64(i64* %base, <2 x i1> , <2 x i64> %src0)
ret <2 x i64>%res
}
@@ -111,17 +115,19 @@
; SKX-LABEL: test6:
; SKX: # BB#0:
; SKX-NEXT: movw $-2049, %ax # imm = 0xF7FF
+; SKX-NEXT: # [1:0.25]
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vcompressps %zmm0, (%rdi) {%k1}
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; SKX-NEXT: vzeroupper # [1:0.00]
+; SKX-NEXT: retq # [1:1.00]
;
; KNL-LABEL: test6:
; KNL: # BB#0:
; KNL-NEXT: movw $-2049, %ax # imm = 0xF7FF
+; KNL-NEXT: # [1:0.25]
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
-; KNL-NEXT: retq
+; KNL-NEXT: retq # [1:1.00]
call void @llvm.masked.compressstore.v16f32(<16 x float> %V, float* %base, <16 x i1> )
ret void
}
@@ -129,11 +135,11 @@
define void @test7(float* %base, <8 x float> %V, <8 x i1> %mask) {
; SKX-LABEL: test7:
; SKX: # BB#0:
-; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
+; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 # [1:1.00]
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vcompressps %ymm0, (%rdi) {%k1}
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; SKX-NEXT: vzeroupper # [1:0.00]
+; SKX-NEXT: retq # [1:1.00]
;
; KNL-LABEL: test7:
; KNL: # BB#0:
@@ -144,7 +150,7 @@
; KNL-NEXT: kshiftlw $8, %k0, %k0
; KNL-NEXT: kshiftrw $8, %k0, %k1
; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
-; KNL-NEXT: retq
+; KNL-NEXT: retq # [1:1.00]
call void @llvm.masked.compressstore.v8f32(<8 x float> %V, float* %base, <8 x i1> %mask)
ret void
}
@@ -152,11 +158,11 @@
define void @test8(double* %base, <8 x double> %V, <8 x i1> %mask) {
; SKX-LABEL: test8:
; SKX: # BB#0:
-; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
+; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 # [1:1.00]
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; SKX-NEXT: vzeroupper # [1:0.00]
+; SKX-NEXT: retq # [1:1.00]
;
; KNL-LABEL: test8:
; KNL: # BB#0:
@@ -164,7 +170,7 @@
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
-; KNL-NEXT: retq
+; KNL-NEXT: retq # [1:1.00]
call void @llvm.masked.compressstore.v8f64(<8 x double> %V, double* %base, <8 x i1> %mask)
ret void
}
@@ -172,11 +178,11 @@
define void @test9(i64* %base, <8 x i64> %V, <8 x i1> %mask) {
; SKX-LABEL: test9:
; SKX: # BB#0:
-; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
+; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 # [1:1.00]
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; SKX-NEXT: vzeroupper # [1:0.00]
+; SKX-NEXT: retq # [1:1.00]
;
; KNL-LABEL: test9:
; KNL: # BB#0:
@@ -184,7 +190,7 @@
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
-; KNL-NEXT: retq
+; KNL-NEXT: retq # [1:1.00]
call void @llvm.masked.compressstore.v8i64(<8 x i64> %V, i64* %base, <8 x i1> %mask)
ret void
}
@@ -192,24 +198,24 @@
define void @test10(i64* %base, <4 x i64> %V, <4 x i1> %mask) {
; SKX-LABEL: test10:
; SKX: # BB#0:
-; SKX-NEXT: vpslld $31, %xmm1, %xmm1
+; SKX-NEXT: vpslld $31, %xmm1, %xmm1 # [1:1.00]
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vpcompressq %ymm0, (%rdi) {%k1}
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; SKX-NEXT: vzeroupper # [1:0.00]
+; SKX-NEXT: retq # [1:1.00]
;
; KNL-LABEL: test10:
; KNL: # BB#0:
; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0
-; KNL-NEXT: vpslld $31, %xmm1, %xmm1
-; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
-; KNL-NEXT: vpmovsxdq %xmm1, %ymm1
-; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; KNL-NEXT: vpslld $31, %xmm1, %xmm1 # [1:1.00]
+; KNL-NEXT: vpsrad $31, %xmm1, %xmm1 # [1:1.00]
+; KNL-NEXT: vpmovsxdq %xmm1, %ymm1 # [3:1.00]
+; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2 # [?:0.000000e+00]
; KNL-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
-; KNL-NEXT: retq
+; KNL-NEXT: retq # [1:1.00]
call void @llvm.masked.compressstore.v4i64(<4 x i64> %V, i64* %base, <4 x i1> %mask)
ret void
}
@@ -217,22 +223,22 @@
define void @test11(i64* %base, <2 x i64> %V, <2 x i1> %mask) {
; SKX-LABEL: test11:
; SKX: # BB#0:
-; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
+; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 # [1:1.00]
; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vpcompressq %xmm0, (%rdi) {%k1}
-; SKX-NEXT: retq
+; SKX-NEXT: retq # [1:1.00]
;
; KNL-LABEL: test11:
; KNL: # BB#0:
; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; KNL-NEXT: vpsllq $63, %xmm1, %xmm1
+; KNL-NEXT: vpsllq $63, %xmm1, %xmm1 # [1:1.00]
; KNL-NEXT: vpsraq $63, %zmm1, %zmm1
-; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2 # [?:0.000000e+00]
; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
-; KNL-NEXT: retq
+; KNL-NEXT: retq # [1:1.00]
call void @llvm.masked.compressstore.v2i64(<2 x i64> %V, i64* %base, <2 x i1> %mask)
ret void
}
@@ -240,22 +246,22 @@
define void @test12(float* %base, <4 x float> %V, <4 x i1> %mask) {
; SKX-LABEL: test12:
; SKX: # BB#0:
-; SKX-NEXT: vpslld $31, %xmm1, %xmm1
+; SKX-NEXT: vpslld $31, %xmm1, %xmm1 # [1:1.00]
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vcompressps %xmm0, (%rdi) {%k1}
-; SKX-NEXT: retq
+; SKX-NEXT: retq # [1:1.00]
;
; KNL-LABEL: test12:
; KNL: # BB#0:
; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; KNL-NEXT: vpslld $31, %xmm1, %xmm1
-; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
-; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; KNL-NEXT: vpslld $31, %xmm1, %xmm1 # [1:1.00]
+; KNL-NEXT: vpsrad $31, %xmm1, %xmm1 # [1:1.00]
+; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2 # [?:0.000000e+00]
; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
-; KNL-NEXT: retq
+; KNL-NEXT: retq # [1:1.00]
call void @llvm.masked.compressstore.v4f32(<4 x float> %V, float* %base, <4 x i1> %mask)
ret void
}
@@ -263,28 +269,28 @@
define <2 x float> @test13(float* %base, <2 x float> %src0, <2 x i32> %trigger) {
; SKX-LABEL: test13:
; SKX: # BB#0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k0
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # [1:0.33]
+; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3][1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k0 # [?:0.000000e+00]
; SKX-NEXT: kshiftlb $6, %k0, %k0
; SKX-NEXT: kshiftrb $6, %k0, %k1
; SKX-NEXT: vexpandps (%rdi), %xmm0 {%k1}
-; SKX-NEXT: retq
+; SKX-NEXT: retq # [1:1.00]
;
; KNL-LABEL: test13:
; KNL: # BB#0:
; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; KNL-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 # [1:0.33]
+; KNL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3][1:0.33]
+; KNL-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 # [1:0.50]
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero[1:1.00]
+; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2 # [?:0.000000e+00]
; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL-NEXT: vexpandps (%rdi), %zmm0 {%k1}
; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; KNL-NEXT: retq
+; KNL-NEXT: retq # [1:1.00]
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.expandload.v2f32(float* %base, <2 x i1> %mask, <2 x float> %src0)
ret <2 x float> %res
@@ -293,27 +299,27 @@
define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) {
; SKX-LABEL: test14:
; SKX: # BB#0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k0
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # [1:0.33]
+; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3][1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k0 # [?:0.000000e+00]
; SKX-NEXT: kshiftlb $6, %k0, %k0
; SKX-NEXT: kshiftrb $6, %k0, %k1
; SKX-NEXT: vcompressps %xmm0, (%rdi) {%k1}
-; SKX-NEXT: retq
+; SKX-NEXT: retq # [1:1.00]
;
; KNL-LABEL: test14:
; KNL: # BB#0:
; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; KNL-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 # [1:0.33]
+; KNL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3][1:0.33]
+; KNL-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 # [1:0.50]
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero[1:1.00]
+; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2 # [?:0.000000e+00]
; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
-; KNL-NEXT: retq
+; KNL-NEXT: retq # [1:1.00]
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
call void @llvm.masked.compressstore.v2f32(<2 x float> %V, float* %base, <2 x i1> %mask)
ret void
@@ -322,14 +328,14 @@
define <32 x float> @test15(float* %base, <32 x float> %src0, <32 x i32> %trigger) {
; ALL-LABEL: test15:
; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm4, %zmm4, %zmm4
-; ALL-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
-; ALL-NEXT: vpcmpeqd %zmm4, %zmm2, %k2
+; ALL-NEXT: vpxord %zmm4, %zmm4, %zmm4 # [?:0.000000e+00]
+; ALL-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # [?:0.000000e+00]
+; ALL-NEXT: vpcmpeqd %zmm4, %zmm2, %k2 # [?:0.000000e+00]
; ALL-NEXT: kmovw %k2, %eax
-; ALL-NEXT: popcntl %eax, %eax
+; ALL-NEXT: popcntl %eax, %eax # [3:1.00]
; ALL-NEXT: vexpandps (%rdi,%rax,4), %zmm1 {%k1}
; ALL-NEXT: vexpandps (%rdi), %zmm0 {%k2}
-; ALL-NEXT: retq
+; ALL-NEXT: retq # [1:1.00]
%mask = icmp eq <32 x i32> %trigger, zeroinitializer
%res = call <32 x float> @llvm.masked.expandload.v32f32(float* %base, <32 x i1> %mask, <32 x float> %src0)
ret <32 x float> %res
@@ -339,27 +345,27 @@
; SKX-LABEL: test16:
; SKX: # BB#0:
; SKX-NEXT: vextracti32x8 $1, %zmm2, %ymm3
-; SKX-NEXT: vpxor %ymm4, %ymm4, %ymm4
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k2
+; SKX-NEXT: vpxor %ymm4, %ymm4, %ymm4 # [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # [?:0.000000e+00]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k2 # [?:0.000000e+00]
; SKX-NEXT: kmovb %k2, %eax
-; SKX-NEXT: popcntl %eax, %eax
+; SKX-NEXT: popcntl %eax, %eax # [3:1.00]
; SKX-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
; SKX-NEXT: vexpandpd (%rdi), %zmm0 {%k2}
-; SKX-NEXT: retq
+; SKX-NEXT: retq # [1:1.00]
;
; KNL-LABEL: test16:
; KNL: # BB#0:
-; KNL-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL-NEXT: vpxor %ymm3, %ymm3, %ymm3 # [1:0.33]
; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm4
-; KNL-NEXT: vpcmpeqd %zmm3, %zmm4, %k1
-; KNL-NEXT: vpcmpeqd %zmm3, %zmm2, %k2
+; KNL-NEXT: vpcmpeqd %zmm3, %zmm4, %k1 # [?:0.000000e+00]
+; KNL-NEXT: vpcmpeqd %zmm3, %zmm2, %k2 # [?:0.000000e+00]
; KNL-NEXT: vexpandpd (%rdi), %zmm0 {%k2}
; KNL-NEXT: kmovw %k2, %eax
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: popcntl %eax, %eax
+; KNL-NEXT: movzbl %al, %eax # [1:0.25]
+; KNL-NEXT: popcntl %eax, %eax # [3:1.00]
; KNL-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
-; KNL-NEXT: retq
+; KNL-NEXT: retq # [1:1.00]
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
%res = call <16 x double>
@llvm.masked.expandload.v16f64(double* %base, <16 x i1> %mask, <16 x double> %src0) ret <16 x double> %res @@ -368,26 +374,26 @@ define void @test17(float* %base, <32 x float> %V, <32 x i32> %trigger) { ; SKX-LABEL: test17: ; SKX: # BB#0: -; SKX-NEXT: vpxord %zmm4, %zmm4, %zmm4 -; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 -; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k2 +; SKX-NEXT: vpxord %zmm4, %zmm4, %zmm4 # [?:0.000000e+00] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # [?:0.000000e+00] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k2 # [?:0.000000e+00] ; SKX-NEXT: kmovw %k2, %eax -; SKX-NEXT: popcntl %eax, %eax +; SKX-NEXT: popcntl %eax, %eax # [3:1.00] ; SKX-NEXT: vcompressps %zmm1, (%rdi,%rax,4) {%k1} ; SKX-NEXT: vcompressps %zmm0, (%rdi) {%k2} -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper # [1:0.00] +; SKX-NEXT: retq # [1:1.00] ; ; KNL-LABEL: test17: ; KNL: # BB#0: -; KNL-NEXT: vpxord %zmm4, %zmm4, %zmm4 -; KNL-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 -; KNL-NEXT: vpcmpeqd %zmm4, %zmm2, %k2 +; KNL-NEXT: vpxord %zmm4, %zmm4, %zmm4 # [?:0.000000e+00] +; KNL-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # [?:0.000000e+00] +; KNL-NEXT: vpcmpeqd %zmm4, %zmm2, %k2 # [?:0.000000e+00] ; KNL-NEXT: kmovw %k2, %eax -; KNL-NEXT: popcntl %eax, %eax +; KNL-NEXT: popcntl %eax, %eax # [3:1.00] ; KNL-NEXT: vcompressps %zmm1, (%rdi,%rax,4) {%k1} ; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k2} -; KNL-NEXT: retq +; KNL-NEXT: retq # [1:1.00] %mask = icmp eq <32 x i32> %trigger, zeroinitializer call void @llvm.masked.compressstore.v32f32(<32 x float> %V, float* %base, <32 x i1> %mask) ret void @@ -396,15 +402,15 @@ define void @test18(double* %base, <16 x double> %V, <16 x i1> %mask) { ; SKX-LABEL: test18: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 +; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 # [1:1.00] ; SKX-NEXT: vpmovb2m %xmm2, %k1 ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: kmovb %k1, %eax -; SKX-NEXT: popcntl %eax, %eax +; SKX-NEXT: popcntl %eax, %eax # [3:1.00] ; SKX-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2} ; SKX-NEXT: vcompresspd %zmm0, (%rdi) {%k1} -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper # [1:0.00] +; SKX-NEXT: retq # [1:1.00] ; ; KNL-LABEL: test18: ; KNL: # BB#0: @@ -413,11 +419,11 @@ ; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL-NEXT: kshiftrw $8, %k1, %k2 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: movzbl %al, %eax -; KNL-NEXT: popcntl %eax, %eax +; KNL-NEXT: movzbl %al, %eax # [1:0.25] +; KNL-NEXT: popcntl %eax, %eax # [3:1.00] ; KNL-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2} ; KNL-NEXT: vcompresspd %zmm0, (%rdi) {%k1} -; KNL-NEXT: retq +; KNL-NEXT: retq # [1:1.00] call void @llvm.masked.compressstore.v16f64(<16 x double> %V, double* %base, <16 x i1> %mask) ret void } Index: test/CodeGen/X86/fma.ll =================================================================== --- test/CodeGen/X86/fma.ll +++ test/CodeGen/X86/fma.ll @@ -12,10 +12,8 @@ ; FMA32-LABEL: test_f32: ; FMA32: ## BB#0: ## %entry ; FMA32-NEXT: pushl %eax ## encoding: [0x50] -; FMA32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08] -; FMA32-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMA32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c] -; FMA32-NEXT: ## xmm1 = mem[0],zero,zero,zero +; FMA32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08]xmm0 = mem[0],zero,zero,zero +; FMA32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]xmm1 = mem[0],zero,zero,zero ; FMA32-NEXT: vfmadd213ss {{[0-9]+}}(%esp), %xmm0, 
%xmm1 ## encoding: [0xc4,0xe2,0x79,0xa9,0x4c,0x24,0x10] ; FMA32-NEXT: vmovss %xmm1, (%esp) ## encoding: [0xc5,0xfa,0x11,0x0c,0x24] ; FMA32-NEXT: flds (%esp) ## encoding: [0xd9,0x04,0x24] @@ -57,10 +55,8 @@ ; FMA32-LABEL: test_f64: ; FMA32: ## BB#0: ## %entry ; FMA32-NEXT: subl $12, %esp ## encoding: [0x83,0xec,0x0c] -; FMA32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x10] -; FMA32-NEXT: ## xmm0 = mem[0],zero -; FMA32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x18] -; FMA32-NEXT: ## xmm1 = mem[0],zero +; FMA32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x10]xmm0 = mem[0],zero +; FMA32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x18]xmm1 = mem[0],zero ; FMA32-NEXT: vfmadd213sd {{[0-9]+}}(%esp), %xmm0, %xmm1 ## encoding: [0xc4,0xe2,0xf9,0xa9,0x4c,0x24,0x20] ; FMA32-NEXT: vmovsd %xmm1, (%esp) ## encoding: [0xc5,0xfb,0x11,0x0c,0x24] ; FMA32-NEXT: fldl (%esp) ## encoding: [0xdd,0x04,0x24] @@ -238,11 +234,138 @@ ; FMA32-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0xc2] ; FMA32-NEXT: retl ## encoding: [0xc3] ; +; FMACALL32-LABEL: test_v4f32: +; FMACALL32: ## BB#0: ## %entry +; FMACALL32-NEXT: subl $108, %esp ## encoding: [0x83,0xec,0x6c] +; FMACALL32-NEXT: vmovaps %xmm2, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x40] +; FMACALL32-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x30] +; FMACALL32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x20] +; FMACALL32-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x08,0x02] +; FMACALL32-NEXT: vextractps $2, %xmm1, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x4c,0x24,0x04,0x02] +; FMACALL32-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x60] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] +; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] +; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20] +; FMACALL32-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x54] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] 
+; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] +; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] +; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20] +; FMACALL32-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] +; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] +; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20] +; FMACALL32-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x54] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x60] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] +; FMACALL32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x1c]xmm0 = mem[0],zero,zero,zero +; FMACALL32-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x10]xmm0 = xmm0[0],mem[0],xmm0[2,3] +; FMACALL32-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x14,0x20]xmm0 = xmm0[0,1],mem[0],xmm0[3] +; FMACALL32-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x30]xmm0 = xmm0[0,1,2],mem[0] +; FMACALL32-NEXT: addl $108, %esp ## encoding: [0x83,0xc4,0x6c] +; FMACALL32-NEXT: retl ## encoding: [0xc3] +; ; FMA64-LABEL: test_v4f32: ; FMA64: ## BB#0: ## %entry ; FMA64-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0xc2] ; FMA64-NEXT: retq ## encoding: [0xc3] ; +; FMACALL64-LABEL: test_v4f32: +; FMACALL64: ## BB#0: ## %entry +; FMACALL64-NEXT: subq $88, %rsp ## encoding: [0x48,0x83,0xec,0x58] +; FMACALL64-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x54,0x24,0x30] +; FMACALL64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x4c,0x24,0x10] +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] 
+; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x20] +; FMACALL64-NEXT: shufps $231, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe7]xmm0 = xmm0[3,1,2,3] +; FMACALL64-NEXT: shufps $231, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xe7]xmm1 = xmm1[3,1,2,3] +; FMACALL64-NEXT: shufps $231, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xe7]xmm2 = xmm2[3,1,2,3] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x04,0x24] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x20] +; FMACALL64-NEXT: shufps $229, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe5]xmm0 = xmm0[1,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: shufps $229, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xe5]xmm1 = xmm1[1,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x30] +; FMACALL64-NEXT: shufps $229, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xe5]xmm2 = xmm2[1,1,2,3] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x14,0x04,0x24]xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; FMACALL64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x04,0x24] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x20] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x30] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x40] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x20] +; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]xmm0 = xmm0[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]xmm1 = xmm1[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x30] +; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]xmm2 = xmm2[1,1] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x40] +; FMACALL64-NEXT: unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FMACALL64-NEXT: unpcklps (%rsp), %xmm1 ## 16-byte Folded Reload [4:?] 
+; FMACALL64-NEXT: ## encoding: [0x0f,0x14,0x0c,0x24]xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; FMACALL64-NEXT: movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1] +; FMACALL64-NEXT: addq $88, %rsp ## encoding: [0x48,0x83,0xc4,0x58] +; FMACALL64-NEXT: retq ## encoding: [0xc3] +; ; AVX512-LABEL: test_v4f32: ; AVX512: ## BB#0: ## %entry ; AVX512-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0xc2] @@ -263,11 +386,286 @@ ; FMA32-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0xa8,0xc2] ; FMA32-NEXT: retl ## encoding: [0xc3] ; +; FMACALL32-LABEL: test_v8f32: +; FMACALL32: ## BB#0: ## %entry +; FMACALL32-NEXT: subl $316, %esp ## encoding: [0x81,0xec,0x3c,0x01,0x00,0x00]imm = 0x13C +; FMACALL32-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ## 32-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0x00,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ## 32-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ## 32-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01] +; FMACALL32-NEXT: vmovaps %xmm3, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x60] +; FMACALL32-NEXT: vextractps $2, %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x5c,0x24,0x08,0x02] +; FMACALL32-NEXT: vextractf128 $1, %ymm1, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xca,0x01] +; FMACALL32-NEXT: vmovaps %xmm2, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x50] +; FMACALL32-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02] +; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01] +; FMACALL32-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40] +; FMACALL32-NEXT: vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0xb4,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] +; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] +; FMACALL32-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] 
+; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa8,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] +; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] +; FMACALL32-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x9c,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] +; FMACALL32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] +; FMACALL32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x90,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] +; FMACALL32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] +; FMACALL32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x84,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] 
+; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] +; FMACALL32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x78] +; FMACALL32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] +; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] +; FMACALL32-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x78] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x84,0x00,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x90,0x00,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] 
+; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x9c,0x00,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0xa8,0x00,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0xb4,0x00,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] +; FMACALL32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x3c]xmm0 = mem[0],zero,zero,zero +; FMACALL32-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x10]xmm0 = xmm0[0],mem[0],xmm0[2,3] +; FMACALL32-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20]xmm0 = xmm0[0,1],mem[0],xmm0[3] +; FMACALL32-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x30,0x30]xmm0 = xmm0[0,1,2],mem[0] +; FMACALL32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c]xmm1 = mem[0],zero,zero,zero +; FMACALL32-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10]xmm1 = xmm1[0],mem[0],xmm1[2,3] +; FMACALL32-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20]xmm1 = xmm1[0,1],mem[0],xmm1[3] +; FMACALL32-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30]xmm1 = xmm1[0,1,2],mem[0] +; FMACALL32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; FMACALL32-NEXT: addl $316, %esp ## encoding: [0x81,0xc4,0x3c,0x01,0x00,0x00]imm = 0x13C +; FMACALL32-NEXT: retl ## encoding: [0xc3] +; ; FMA64-LABEL: test_v8f32: ; FMA64: ## BB#0: ## %entry ; FMA64-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0xa8,0xc2] ; FMA64-NEXT: retq ## encoding: [0xc3] ; +; FMACALL64-LABEL: test_v8f32: +; FMACALL64: ## BB#0: ## %entry +; FMACALL64-NEXT: subq $136, %rsp ## encoding: [0x48,0x81,0xec,0x88,0x00,0x00,0x00] +; FMACALL64-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x6c,0x24,0x50] +; FMACALL64-NEXT: movaps %xmm4, (%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x24,0x24] +; FMACALL64-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x5c,0x24,0x40] +; FMACALL64-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x54,0x24,0x60] +; FMACALL64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x4c,0x24,0x30] +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] 
+; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x10] +; FMACALL64-NEXT: shufps $231, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe7]xmm0 = xmm0[3,1,2,3] +; FMACALL64-NEXT: movaps %xmm2, %xmm1 ## encoding: [0x0f,0x28,0xca] +; FMACALL64-NEXT: shufps $231, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xe7]xmm1 = xmm1[3,1,2,3] +; FMACALL64-NEXT: movaps %xmm4, %xmm2 ## encoding: [0x0f,0x28,0xd4] +; FMACALL64-NEXT: shufps $231, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xe7]xmm2 = xmm2[3,1,2,3] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x20] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x10] +; FMACALL64-NEXT: shufps $229, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe5]xmm0 = xmm0[1,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x60] +; FMACALL64-NEXT: shufps $229, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xe5]xmm1 = xmm1[1,1,2,3] +; FMACALL64-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x14,0x24] +; FMACALL64-NEXT: shufps $229, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xe5]xmm2 = xmm2[1,1,2,3] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Folded Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x14,0x44,0x24,0x20]xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x70] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x10] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x60] +; FMACALL64-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x14,0x24] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x20] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x10] +; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]xmm0 = xmm0[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x60] +; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]xmm1 = xmm1[1,1] +; FMACALL64-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x14,0x24] +; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]xmm2 = xmm2[1,1] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] 
+; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x20] +; FMACALL64-NEXT: unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FMACALL64-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Folded Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x14,0x4c,0x24,0x70]xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; FMACALL64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x4c,0x24,0x20] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x30] +; FMACALL64-NEXT: shufps $231, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe7]xmm0 = xmm0[3,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x40] +; FMACALL64-NEXT: shufps $231, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xe7]xmm1 = xmm1[3,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x50] +; FMACALL64-NEXT: shufps $231, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xe7]xmm2 = xmm2[3,1,2,3] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x04,0x24] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x30] +; FMACALL64-NEXT: shufps $229, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe5]xmm0 = xmm0[1,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x40] +; FMACALL64-NEXT: shufps $229, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xe5]xmm1 = xmm1[1,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x50] +; FMACALL64-NEXT: shufps $229, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xe5]xmm2 = xmm2[1,1,2,3] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x14,0x04,0x24]xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; FMACALL64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x04,0x24] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x30] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x40] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x50] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x10] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x30] +; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]xmm0 = xmm0[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] 
+; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x40] +; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]xmm1 = xmm1[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x50] +; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]xmm2 = xmm2[1,1] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FMACALL64-NEXT: unpcklps (%rsp), %xmm1 ## 16-byte Folded Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x14,0x0c,0x24]xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x20] +; FMACALL64-NEXT: addq $136, %rsp ## encoding: [0x48,0x81,0xc4,0x88,0x00,0x00,0x00] +; FMACALL64-NEXT: retq ## encoding: [0xc3] +; ; AVX512-LABEL: test_v8f32: ; AVX512: ## BB#0: ## %entry ; AVX512-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0xa8,0xc2] @@ -295,12 +693,542 @@ ; FMA32-NEXT: popl %ebp ## encoding: [0x5d] ; FMA32-NEXT: retl ## encoding: [0xc3] ; +; FMACALL32-LABEL: test_v16f32: +; FMACALL32: ## BB#0: ## %entry +; FMACALL32-NEXT: pushl %ebp ## encoding: [0x55] +; FMACALL32-NEXT: movl %esp, %ebp ## encoding: [0x89,0xe5] +; FMACALL32-NEXT: andl $-32, %esp ## encoding: [0x83,0xe4,0xe0] +; FMACALL32-NEXT: subl $448, %esp ## encoding: [0x81,0xec,0xc0,0x01,0x00,0x00]imm = 0x1C0 +; FMACALL32-NEXT: vmovaps %ymm3, {{[0-9]+}}(%esp) ## 32-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x29,0x5c,0x24,0x60] +; FMACALL32-NEXT: vmovaps %ymm2, {{[0-9]+}}(%esp) ## 32-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x60,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) ## 32-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0x80,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) ## 32-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] +; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; FMACALL32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] +; FMACALL32-NEXT: vextractf128 $1, %ymm3, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd8,0x01] +; FMACALL32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xb0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] +; FMACALL32-NEXT: vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01] +; FMACALL32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] 
+; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x54,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x48,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x3c,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] +; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] 
+; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x30,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] +; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x24,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] +; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x18,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] +; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x0c,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] +; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; FMACALL32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] 
+; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; FMACALL32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60] +; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; FMACALL32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50] +; FMACALL32-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x00,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] +; FMACALL32-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0xf4,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] +; FMACALL32-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] 
+; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0xe8,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] +; FMACALL32-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x80,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] +; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x60] +; FMACALL32-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] +; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x50] +; FMACALL32-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] +; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] 
+; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0xdc,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] +; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0xdc,0x00,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x50] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x60] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] 
+; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0xe8,0x00,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0xf4,0x00,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x00,0x01,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x0c,0x01,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x18,0x01,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x24,0x01,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x30,0x01,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x3c,0x01,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x48,0x01,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] 
+; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x54,0x01,0x00,0x00] +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34] +; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30] +; FMACALL32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x2c]xmm0 = mem[0],zero,zero,zero +; FMACALL32-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x28,0x10]xmm0 = xmm0[0],mem[0],xmm0[2,3] +; FMACALL32-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x24,0x20]xmm0 = xmm0[0,1],mem[0],xmm0[3] +; FMACALL32-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x20,0x30]xmm0 = xmm0[0,1,2],mem[0] +; FMACALL32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x1c]xmm1 = mem[0],zero,zero,zero +; FMACALL32-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x18,0x10]xmm1 = xmm1[0],mem[0],xmm1[2,3] +; FMACALL32-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x14,0x20]xmm1 = xmm1[0,1],mem[0],xmm1[3] +; FMACALL32-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x10,0x30]xmm1 = xmm1[0,1,2],mem[0] +; FMACALL32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; FMACALL32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x4c]xmm1 = mem[0],zero,zero,zero +; FMACALL32-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x48,0x10]xmm1 = xmm1[0],mem[0],xmm1[2,3] +; FMACALL32-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x44,0x20]xmm1 = xmm1[0,1],mem[0],xmm1[3] +; FMACALL32-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x40,0x30]xmm1 = xmm1[0,1,2],mem[0] +; FMACALL32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x3c]xmm2 = mem[0],zero,zero,zero +; FMACALL32-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x38,0x10]xmm2 = xmm2[0],mem[0],xmm2[2,3] +; FMACALL32-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x34,0x20]xmm2 = xmm2[0,1],mem[0],xmm2[3] +; FMACALL32-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x30,0x30]xmm2 = xmm2[0,1,2],mem[0] +; FMACALL32-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] +; FMACALL32-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec] +; FMACALL32-NEXT: popl %ebp ## encoding: [0x5d] +; FMACALL32-NEXT: retl ## encoding: [0xc3] +; ; FMA64-LABEL: test_v16f32: ; FMA64: ## BB#0: ## %entry ; FMA64-NEXT: vfmadd213ps %ymm4, %ymm2, %ymm0 ## encoding: [0xc4,0xe2,0x6d,0xa8,0xc4] ; FMA64-NEXT: vfmadd213ps %ymm5, %ymm3, %ymm1 ## encoding: [0xc4,0xe2,0x65,0xa8,0xcd] ; FMA64-NEXT: retq ## encoding: [0xc3] ; +; FMACALL64-LABEL: test_v16f32: +; FMACALL64: ## BB#0: ## %entry +; FMACALL64-NEXT: subq $168, %rsp ## encoding: [0x48,0x81,0xec,0xa8,0x00,0x00,0x00] +; FMACALL64-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] 
+; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0xbc,0x24,0x80,0x00,0x00,0x00] +; FMACALL64-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x74,0x24,0x20] +; FMACALL64-NEXT: movaps %xmm5, (%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x2c,0x24] +; FMACALL64-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x64,0x24,0x10] +; FMACALL64-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x5c,0x24,0x70] +; FMACALL64-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x54,0x24,0x40] +; FMACALL64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x4c,0x24,0x30] +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x50] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xb0,0x00,0x00,0x00] +; FMACALL64-NEXT: shufps $231, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe7]xmm0 = xmm0[3,1,2,3] +; FMACALL64-NEXT: movaps %xmm4, %xmm1 ## encoding: [0x0f,0x28,0xcc] +; FMACALL64-NEXT: shufps $231, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xe7]xmm1 = xmm1[3,1,2,3] +; FMACALL64-NEXT: shufps $231, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xe7]xmm2 = xmm2[3,1,2,3] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x60] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x50] +; FMACALL64-NEXT: shufps $229, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe5]xmm0 = xmm0[1,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: shufps $229, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xe5]xmm1 = xmm1[1,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xb0,0x00,0x00,0x00] +; FMACALL64-NEXT: shufps $229, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xe5]xmm2 = xmm2[1,1,2,3] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Folded Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x14,0x44,0x24,0x60]xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x84,0x24,0x90,0x00,0x00,0x00] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x50] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xb0,0x00,0x00,0x00] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] 
+; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x60] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x50] +; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]xmm0 = xmm0[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]xmm1 = xmm1[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xb0,0x00,0x00,0x00] +; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]xmm2 = xmm2[1,1] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x60] +; FMACALL64-NEXT: unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FMACALL64-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Folded Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x14,0x8c,0x24,0x90,0x00,0x00,0x00]xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; FMACALL64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x4c,0x24,0x60] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x30] +; FMACALL64-NEXT: shufps $231, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe7]xmm0 = xmm0[3,1,2,3] +; FMACALL64-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x0c,0x24] +; FMACALL64-NEXT: shufps $231, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xe7]xmm1 = xmm1[3,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xc0,0x00,0x00,0x00] +; FMACALL64-NEXT: shufps $231, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xe7]xmm2 = xmm2[3,1,2,3] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x10] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x30] +; FMACALL64-NEXT: shufps $229, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe5]xmm0 = xmm0[1,1,2,3] +; FMACALL64-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x0c,0x24] +; FMACALL64-NEXT: shufps $229, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xe5]xmm1 = xmm1[1,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xc0,0x00,0x00,0x00] +; FMACALL64-NEXT: shufps $229, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xe5]xmm2 = xmm2[1,1,2,3] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Folded Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x14,0x44,0x24,0x10]xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x50] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] 
+; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x30] +; FMACALL64-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x0c,0x24] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xc0,0x00,0x00,0x00] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x10] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x30] +; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]xmm0 = xmm0[1,1] +; FMACALL64-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x0c,0x24] +; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]xmm1 = xmm1[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xc0,0x00,0x00,0x00] +; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]xmm2 = xmm2[1,1] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FMACALL64-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Folded Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x14,0x4c,0x24,0x50]xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; FMACALL64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x4c,0x24,0x10] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x40] +; FMACALL64-NEXT: shufps $231, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe7]xmm0 = xmm0[3,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x20] +; FMACALL64-NEXT: shufps $231, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xe7]xmm1 = xmm1[3,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xd0,0x00,0x00,0x00] +; FMACALL64-NEXT: shufps $231, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xe7]xmm2 = xmm2[3,1,2,3] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x04,0x24] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x40] +; FMACALL64-NEXT: shufps $229, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe5]xmm0 = xmm0[1,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] 
+; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x20] +; FMACALL64-NEXT: shufps $229, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xe5]xmm1 = xmm1[1,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xd0,0x00,0x00,0x00] +; FMACALL64-NEXT: shufps $229, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xe5]xmm2 = xmm2[1,1,2,3] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x14,0x04,0x24]xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x30] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x40] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x20] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xd0,0x00,0x00,0x00] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x04,0x24] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x40] +; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]xmm0 = xmm0[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x20] +; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]xmm1 = xmm1[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xd0,0x00,0x00,0x00] +; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]xmm2 = xmm2[1,1] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x0c,0x24] +; FMACALL64-NEXT: unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FMACALL64-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Folded Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x14,0x4c,0x24,0x30]xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; FMACALL64-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x0c,0x24] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x70] +; FMACALL64-NEXT: shufps $231, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe7]xmm0 = xmm0[3,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] 
+; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x8c,0x24,0x80,0x00,0x00,0x00] +; FMACALL64-NEXT: shufps $231, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xe7]xmm1 = xmm1[3,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xe0,0x00,0x00,0x00] +; FMACALL64-NEXT: shufps $231, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xe7]xmm2 = xmm2[3,1,2,3] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x20] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x70] +; FMACALL64-NEXT: shufps $229, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe5]xmm0 = xmm0[1,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x8c,0x24,0x80,0x00,0x00,0x00] +; FMACALL64-NEXT: shufps $229, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xe5]xmm1 = xmm1[1,1,2,3] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xe0,0x00,0x00,0x00] +; FMACALL64-NEXT: shufps $229, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xe5]xmm2 = xmm2[1,1,2,3] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Folded Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x14,0x44,0x24,0x20]xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x20] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x70] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x8c,0x24,0x80,0x00,0x00,0x00] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xe0,0x00,0x00,0x00] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x40] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x70] +; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]xmm0 = xmm0[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x8c,0x24,0x80,0x00,0x00,0x00] +; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]xmm1 = xmm1[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xe0,0x00,0x00,0x00] +; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]xmm2 = xmm2[1,1] +; FMACALL64-NEXT: callq _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x40] +; FMACALL64-NEXT: unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FMACALL64-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Folded Reload [4:?] 
+; FMACALL64-NEXT: ## encoding: [0x0f,0x14,0x4c,0x24,0x20]xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; FMACALL64-NEXT: movaps %xmm1, %xmm3 ## encoding: [0x0f,0x28,0xd9] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x60] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x14,0x24] +; FMACALL64-NEXT: addq $168, %rsp ## encoding: [0x48,0x81,0xc4,0xa8,0x00,0x00,0x00] +; FMACALL64-NEXT: retq ## encoding: [0xc3] +; ; AVX512-LABEL: test_v16f32: ; AVX512: ## BB#0: ## %entry ; AVX512-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2] @@ -321,11 +1249,74 @@ ; FMA32-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0xc2] ; FMA32-NEXT: retl ## encoding: [0xc3] ; +; FMACALL32-LABEL: test_v2f64: +; FMACALL32: ## BB#0: ## %entry +; FMACALL32-NEXT: subl $108, %esp ## encoding: [0x83,0xec,0x6c] +; FMACALL32-NEXT: vmovaps %xmm2, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x30] +; FMACALL32-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40] +; FMACALL32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50] +; FMACALL32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x54,0x24,0x10] +; FMACALL32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x4c,0x24,0x08] +; FMACALL32-NEXT: vmovlps %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x13,0x04,0x24] +; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: vmovapd {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x30] +; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32-NEXT: vmovapd {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x40] +; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovapd {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] 
+; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x50] +; FMACALL32-NEXT: vmovhpd %xmm0, (%esp) ## encoding: [0xc5,0xf9,0x17,0x04,0x24] +; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] +; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] +; FMACALL32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28]xmm0 = mem[0],zero +; FMACALL32-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x20]xmm0 = xmm0[0],mem[0] +; FMACALL32-NEXT: addl $108, %esp ## encoding: [0x83,0xc4,0x6c] +; FMACALL32-NEXT: retl ## encoding: [0xc3] +; ; FMA64-LABEL: test_v2f64: ; FMA64: ## BB#0: ## %entry ; FMA64-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0xc2] ; FMA64-NEXT: retq ## encoding: [0xc3] ; +; FMACALL64-LABEL: test_v2f64: +; FMACALL64: ## BB#0: ## %entry +; FMACALL64-NEXT: subq $72, %rsp ## encoding: [0x48,0x83,0xec,0x48] +; FMACALL64-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x54,0x24,0x20] +; FMACALL64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x4c,0x24,0x10] +; FMACALL64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x04,0x24] +; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x30] +; FMACALL64-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x04,0x24] +; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]xmm0 = xmm0[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]xmm1 = xmm1[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x20] +; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]xmm2 = xmm2[1,1] +; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x66,0x0f,0x28,0x4c,0x24,0x30] +; FMACALL64-NEXT: unpcklpd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x14,0xc8]xmm1 = xmm1[0],xmm0[0] +; FMACALL64-NEXT: movapd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x28,0xc1] +; FMACALL64-NEXT: addq $72, %rsp ## encoding: [0x48,0x83,0xc4,0x48] +; FMACALL64-NEXT: retq ## encoding: [0xc3] +; ; AVX512-LABEL: test_v2f64: ; AVX512: ## BB#0: ## %entry ; AVX512-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0xc2] @@ -346,11 +1337,156 @@ ; FMA32-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0xc2] ; FMA32-NEXT: retl ## encoding: [0xc3] ; +; FMACALL32-LABEL: test_v4f64: +; FMACALL32: ## BB#0: ## %entry +; FMACALL32-NEXT: subl $252, %esp ## encoding: [0x81,0xec,0xfc,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ## 32-byte Spill [1:?] 
+; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ## 32-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ## 32-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01] +; FMACALL32-NEXT: vmovaps %xmm3, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x70] +; FMACALL32-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x5c,0x24,0x10] +; FMACALL32-NEXT: vextractf128 $1, %ymm1, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xca,0x01] +; FMACALL32-NEXT: vmovaps %xmm2, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x60] +; FMACALL32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x54,0x24,0x08] +; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01] +; FMACALL32-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x50] +; FMACALL32-NEXT: vmovlps %xmm1, (%esp) ## encoding: [0xc5,0xf8,0x13,0x0c,0x24] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x44] +; FMACALL32-NEXT: vmovupd {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfd,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32-NEXT: vmovupd {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfd,0x10,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovupd {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfd,0x10,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovhpd %xmm0, (%esp) ## encoding: [0xc5,0xf9,0x17,0x04,0x24] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x38] +; FMACALL32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] +; FMACALL32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] 
+; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovlps %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x13,0x04,0x24] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: vmovapd {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x70] +; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32-NEXT: vmovapd {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x60] +; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovapd {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x50] +; FMACALL32-NEXT: vmovhpd %xmm0, (%esp) ## encoding: [0xc5,0xf9,0x17,0x04,0x24] +; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x38] +; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x44] +; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] +; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x18] +; FMACALL32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x30]xmm0 = mem[0],zero +; FMACALL32-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x28]xmm0 = xmm0[0],mem[0] +; FMACALL32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x20]xmm1 = mem[0],zero +; FMACALL32-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x18]xmm1 = xmm1[0],mem[0] +; FMACALL32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; FMACALL32-NEXT: addl $252, %esp ## encoding: [0x81,0xc4,0xfc,0x00,0x00,0x00] +; FMACALL32-NEXT: retl ## encoding: [0xc3] +; ; FMA64-LABEL: test_v4f64: ; FMA64: ## BB#0: ## %entry ; FMA64-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0xc2] ; FMA64-NEXT: retq ## encoding: [0xc3] ; +; FMACALL64-LABEL: test_v4f64: +; FMACALL64: ## BB#0: ## %entry +; FMACALL64-NEXT: subq $120, %rsp ## encoding: [0x48,0x83,0xec,0x78] +; FMACALL64-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x6c,0x24,0x40] +; FMACALL64-NEXT: movaps %xmm4, (%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x24,0x24] +; FMACALL64-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x5c,0x24,0x30] +; FMACALL64-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x54,0x24,0x60] +; FMACALL64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x4c,0x24,0x20] +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] 
+; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x50] +; FMACALL64-NEXT: movaps %xmm2, %xmm1 ## encoding: [0x0f,0x28,0xca] +; FMACALL64-NEXT: movaps %xmm4, %xmm2 ## encoding: [0x0f,0x28,0xd4] +; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x10] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x50] +; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]xmm0 = xmm0[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x60] +; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]xmm1 = xmm1[1,1] +; FMACALL64-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x14,0x24] +; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]xmm2 = xmm2[1,1] +; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x66,0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: unpcklpd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x14,0xc8]xmm1 = xmm1[0],xmm0[0] +; FMACALL64-NEXT: movapd %xmm1, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x66,0x0f,0x29,0x4c,0x24,0x10] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x20] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x30] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x40] +; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x04,0x24] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x20] +; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]xmm0 = xmm0[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x30] +; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]xmm1 = xmm1[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x40] +; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]xmm2 = xmm2[1,1] +; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movapd (%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x66,0x0f,0x28,0x0c,0x24] +; FMACALL64-NEXT: unpcklpd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x14,0xc8]xmm1 = xmm1[0],xmm0[0] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] 
+; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x10] +; FMACALL64-NEXT: addq $120, %rsp ## encoding: [0x48,0x83,0xc4,0x78] +; FMACALL64-NEXT: retq ## encoding: [0xc3] +; ; AVX512-LABEL: test_v4f64: ; AVX512: ## BB#0: ## %entry ; AVX512-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0xc2] @@ -378,12 +1514,293 @@ ; FMA32-NEXT: popl %ebp ## encoding: [0x5d] ; FMA32-NEXT: retl ## encoding: [0xc3] ; +; FMACALL32-LABEL: test_v8f64: +; FMACALL32: ## BB#0: ## %entry +; FMACALL32-NEXT: pushl %ebp ## encoding: [0x55] +; FMACALL32-NEXT: movl %esp, %ebp ## encoding: [0x89,0xe5] +; FMACALL32-NEXT: andl $-32, %esp ## encoding: [0x83,0xe4,0xe0] +; FMACALL32-NEXT: subl $384, %esp ## encoding: [0x81,0xec,0x80,0x01,0x00,0x00]imm = 0x180 +; FMACALL32-NEXT: vmovaps %ymm3, {{[0-9]+}}(%esp) ## 32-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x29,0x9c,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovaps %ymm2, {{[0-9]+}}(%esp) ## 32-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x00,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) ## 32-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) ## 32-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] +; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; FMACALL32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] +; FMACALL32-NEXT: vextractf128 $1, %ymm3, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd8,0x01] +; FMACALL32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x50,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x08] +; FMACALL32-NEXT: vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01] +; FMACALL32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x40,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovlps %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x13,0x04,0x24] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovapd 40(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x28] +; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32-NEXT: vmovapd {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfd,0x28,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovapd {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] 
+; FMACALL32-NEXT: ## encoding: [0xc5,0xfd,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovhpd %xmm0, (%esp) ## encoding: [0xc5,0xf9,0x17,0x04,0x24] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x88,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] +; FMACALL32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovlps %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x13,0x04,0x24] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] +; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; FMACALL32-NEXT: vmovapd %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x29,0x44,0x24,0x30] +; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; FMACALL32-NEXT: vmovapd %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x29,0x44,0x24,0x20] +; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; FMACALL32-NEXT: vmovapd %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x29,0x84,0x24,0x30,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovhpd %xmm0, (%esp) ## encoding: [0xc5,0xf9,0x17,0x04,0x24] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] +; FMACALL32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] 
+; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20] +; FMACALL32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x30,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovlps %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x13,0x04,0x24] +; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x30] +; FMACALL32-NEXT: vmovapd 8(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x08] +; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32-NEXT: vmovapd {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfd,0x28,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovapd {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfd,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovhpd %xmm0, (%esp) ## encoding: [0xc5,0xf9,0x17,0x04,0x24] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill [1:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x20] +; FMACALL32-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] +; FMACALL32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ## 32-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32-NEXT: vmovlps %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x13,0x04,0x24] +; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: vmovapd {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32-NEXT: vmovapd {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x84,0x24,0x50,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x08] +; FMACALL32-NEXT: vmovapd {{[0-9]+}}(%esp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x84,0x24,0x40,0x01,0x00,0x00] +; FMACALL32-NEXT: vmovhpd %xmm0, (%esp) ## encoding: [0xc5,0xf9,0x17,0x04,0x24] +; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] 
+; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x20] +; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x58] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x30] +; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x50] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x48] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x9c,0x24,0x80,0x00,0x00,0x00] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x88,0x00,0x00,0x00] +; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x78] +; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload [4:?] +; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x94,0x00,0x00,0x00] +; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x70] +; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x68] +; FMACALL32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x60]xmm0 = mem[0],zero +; FMACALL32-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x58]xmm0 = xmm0[0],mem[0] +; FMACALL32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x50]xmm1 = mem[0],zero +; FMACALL32-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x48]xmm1 = xmm1[0],mem[0] +; FMACALL32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; FMACALL32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00]xmm1 = mem[0],zero +; FMACALL32-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x78]xmm1 = xmm1[0],mem[0] +; FMACALL32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfb,0x10,0x54,0x24,0x70]xmm2 = mem[0],zero +; FMACALL32-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0x16,0x54,0x24,0x68]xmm2 = xmm2[0],mem[0] +; FMACALL32-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] +; FMACALL32-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec] +; FMACALL32-NEXT: popl %ebp ## encoding: [0x5d] +; FMACALL32-NEXT: retl ## encoding: [0xc3] +; ; FMA64-LABEL: test_v8f64: ; FMA64: ## BB#0: ## %entry ; FMA64-NEXT: vfmadd213pd %ymm4, %ymm2, %ymm0 ## encoding: [0xc4,0xe2,0xed,0xa8,0xc4] ; FMA64-NEXT: vfmadd213pd %ymm5, %ymm3, %ymm1 ## encoding: [0xc4,0xe2,0xe5,0xa8,0xcd] ; FMA64-NEXT: retq ## encoding: [0xc3] ; +; FMACALL64-LABEL: test_v8f64: +; FMACALL64: ## BB#0: ## %entry +; FMACALL64-NEXT: subq $152, %rsp ## encoding: [0x48,0x81,0xec,0x98,0x00,0x00,0x00] +; FMACALL64-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x7c,0x24,0x70] +; FMACALL64-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x74,0x24,0x20] +; FMACALL64-NEXT: movaps %xmm5, (%rsp) ## 16-byte Spill [1:?] 
+; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x2c,0x24] +; FMACALL64-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x64,0x24,0x10] +; FMACALL64-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x5c,0x24,0x60] +; FMACALL64-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x54,0x24,0x50] +; FMACALL64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x4c,0x24,0x40] +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xa0,0x00,0x00,0x00] +; FMACALL64-NEXT: movaps %xmm4, %xmm1 ## encoding: [0x0f,0x28,0xcc] +; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x30] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]xmm0 = xmm0[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]xmm1 = xmm1[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xa0,0x00,0x00,0x00] +; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]xmm2 = xmm2[1,1] +; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x66,0x0f,0x28,0x4c,0x24,0x30] +; FMACALL64-NEXT: unpcklpd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x14,0xc8]xmm1 = xmm1[0],xmm0[0] +; FMACALL64-NEXT: movapd %xmm1, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x66,0x0f,0x29,0x4c,0x24,0x30] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x40] +; FMACALL64-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x0c,0x24] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xb0,0x00,0x00,0x00] +; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x10] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x40] +; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]xmm0 = xmm0[1,1] +; FMACALL64-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload [4:?] 
+; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x0c,0x24] +; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]xmm1 = xmm1[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xb0,0x00,0x00,0x00] +; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]xmm2 = xmm2[1,1] +; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x66,0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: unpcklpd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x14,0xc8]xmm1 = xmm1[0],xmm0[0] +; FMACALL64-NEXT: movapd %xmm1, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x66,0x0f,0x29,0x4c,0x24,0x10] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x50] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x20] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xc0,0x00,0x00,0x00] +; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x04,0x24] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x50] +; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]xmm0 = xmm0[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x20] +; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]xmm1 = xmm1[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xc0,0x00,0x00,0x00] +; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]xmm2 = xmm2[1,1] +; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movapd (%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x66,0x0f,0x28,0x0c,0x24] +; FMACALL64-NEXT: unpcklpd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x14,0xc8]xmm1 = xmm1[0],xmm0[0] +; FMACALL64-NEXT: movapd %xmm1, (%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x66,0x0f,0x29,0x0c,0x24] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x60] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x70] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xd0,0x00,0x00,0x00] +; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill [1:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x20] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x60] +; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]xmm0 = xmm0[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] 
+; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x70] +; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]xmm1 = xmm1[1,1] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xd0,0x00,0x00,0x00] +; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]xmm2 = xmm2[1,1] +; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL64-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x66,0x0f,0x28,0x5c,0x24,0x20] +; FMACALL64-NEXT: unpcklpd %xmm0, %xmm3 ## encoding: [0x66,0x0f,0x14,0xd8]xmm3 = xmm3[0],xmm0[0] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x30] +; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] +; FMACALL64-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload [4:?] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x14,0x24] +; FMACALL64-NEXT: addq $152, %rsp ## encoding: [0x48,0x81,0xc4,0x98,0x00,0x00,0x00] +; FMACALL64-NEXT: retq ## encoding: [0xc3] +; ; AVX512-LABEL: test_v8f64: ; AVX512: ## BB#0: ## %entry ; AVX512-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2] Index: test/CodeGen/X86/fp128-i128.ll =================================================================== --- test/CodeGen/X86/fp128-i128.ll +++ test/CodeGen/X86/fp128-i128.ll @@ -44,16 +44,18 @@ define void @TestUnionLD1(fp128 %s, i64 %n) #0 { ; CHECK-LABEL: TestUnionLD1: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax # [4:0.50] ; CHECK-NEXT: movabsq $281474976710655, %rcx # imm = 0xFFFFFFFFFFFF -; CHECK-NEXT: andq %rdi, %rcx +; CHECK-NEXT: # [1:0.33] +; CHECK-NEXT: andq %rdi, %rcx # [1:0.33] ; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000 -; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: # [1:0.33] +; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx # [5:0.50] +; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: orq %rcx, %rdx # [1:0.33] +; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 # [4:0.50] ; CHECK-NEXT: jmp foo # TAILCALL entry: %0 = bitcast fp128 %s to i128 @@ -79,12 +81,12 @@ define fp128 @TestUnionLD2(fp128 %s) #0 { ; CHECK-LABEL: TestUnionLD2: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax # [4:0.50] +; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 # [4:0.50] +; CHECK-NEXT: retq # [5:1.00] entry: %0 = bitcast fp128 %s to i128 %bf.clear = and i128 %0, -18446744073709551616 @@ -103,23 +105,24 @@ define fp128 @TestI128_1(fp128 %x) #0 { ; CHECK-LABEL: TestI128_1: ; CHECK: # BB#0: # %entry 
-; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: subq $40, %rsp # [1:0.33] +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax # [4:0.50] ; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: andq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rax, (%rsp) -; CHECK-NEXT: movaps (%rsp), %xmm0 -; CHECK-NEXT: movaps {{.*}}(%rip), %xmm1 +; CHECK-NEXT: # [1:0.33] +; CHECK-NEXT: andq {{[0-9]+}}(%rsp), %rcx # [5:0.50] +; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: movq %rax, (%rsp) # [1:1.00] +; CHECK-NEXT: movaps (%rsp), %xmm0 # [4:0.50] +; CHECK-NEXT: movaps {{.*}}(%rip), %xmm1 # [4:0.50] ; CHECK-NEXT: callq __lttf2 -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: sets %cl -; CHECK-NEXT: shlq $4, %rcx -; CHECK-NEXT: movaps {{\.LCPI.*}}(%rcx), %xmm0 -; CHECK-NEXT: addq $40, %rsp -; CHECK-NEXT: retq +; CHECK-NEXT: xorl %ecx, %ecx # [1:0.33] +; CHECK-NEXT: testl %eax, %eax # [1:0.33] +; CHECK-NEXT: sets %cl # [1:0.33] +; CHECK-NEXT: shlq $4, %rcx # [1:0.50] +; CHECK-NEXT: movaps {{\.LCPI.*}}(%rcx), %xmm0 # [4:0.50] +; CHECK-NEXT: addq $40, %rsp # [1:0.33] +; CHECK-NEXT: retq # [5:1.00] entry: %0 = bitcast fp128 %x to i128 %bf.clear = and i128 %0, 170141183460469231731687303715884105727 @@ -141,13 +144,13 @@ define fp128 @TestI128_2(fp128 %x, fp128 %y) #0 { ; CHECK-LABEL: TestI128_2: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: cmpq $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: jns .LBB3_2 +; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: cmpq $0, -{{[0-9]+}}(%rsp) # [5:1.00] +; CHECK-NEXT: jns .LBB3_2 # [1:1.00] ; CHECK-NEXT: # BB#1: # %entry -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: movaps %xmm1, %xmm0 # [1:1.00] ; CHECK-NEXT: .LBB3_2: # %entry -; CHECK-NEXT: retq +; CHECK-NEXT: retq # [5:1.00] entry: %0 = bitcast fp128 %x to i128 %cmp = icmp sgt i128 %0, -1 @@ -169,30 +172,33 @@ define fp128 @TestI128_3(fp128 %x, i32* nocapture readnone %ex) #0 { ; CHECK-LABEL: TestI128_3: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: subq $56, %rsp # [1:0.33] +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax # [4:0.50] ; CHECK-NEXT: movabsq $9223090561878065152, %rcx # imm = 0x7FFF000000000000 -; CHECK-NEXT: testq %rcx, %rax -; CHECK-NEXT: je .LBB4_2 +; CHECK-NEXT: # [1:0.33] +; CHECK-NEXT: testq %rcx, %rax # [1:0.33] +; CHECK-NEXT: je .LBB4_2 # [1:1.00] ; CHECK-NEXT: # BB#1: -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: jmp .LBB4_3 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx # [4:0.50] +; CHECK-NEXT: jmp .LBB4_3 # [1:1.00] ; CHECK-NEXT: .LBB4_2: # %if.then -; CHECK-NEXT: movaps {{.*}}(%rip), %xmm1 +; CHECK-NEXT: movaps {{.*}}(%rip), %xmm1 # [4:0.50] ; CHECK-NEXT: callq __multf3 -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx # [4:0.50] ; CHECK-NEXT: movabsq $-9223090561878065153, %rdx # imm = 0x8000FFFFFFFFFFFF -; CHECK-NEXT: andq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: # [1:0.33] +; CHECK-NEXT: andq {{[0-9]+}}(%rsp), %rdx # [5:0.50] ; CHECK-NEXT: movabsq $4611123068473966592, %rax # 
imm = 0x3FFE000000000000 -; CHECK-NEXT: orq %rdx, %rax +; CHECK-NEXT: # [1:0.33] +; CHECK-NEXT: orq %rdx, %rax # [1:0.33] ; CHECK-NEXT: .LBB4_3: # %if.end -; CHECK-NEXT: movq %rcx, (%rsp) -; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps (%rsp), %xmm0 -; CHECK-NEXT: addq $56, %rsp -; CHECK-NEXT: retq +; CHECK-NEXT: movq %rcx, (%rsp) # [1:1.00] +; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: movaps (%rsp), %xmm0 # [4:0.50] +; CHECK-NEXT: addq $56, %rsp # [1:0.33] +; CHECK-NEXT: retq # [5:1.00] entry: %0 = bitcast fp128 %x to i128 %bf.cast = and i128 %0, 170135991163610696904058773219554885632 @@ -225,16 +231,16 @@ define fp128 @TestI128_4(fp128 %x) #0 { ; CHECK-LABEL: TestI128_4: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT: movaps (%rsp), %xmm0 +; CHECK-NEXT: subq $40, %rsp # [1:0.33] +; CHECK-NEXT: movaps %xmm0, %xmm1 # [1:1.00] +; CHECK-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax # [4:0.50] +; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: movq $0, (%rsp) # [1:1.00] +; CHECK-NEXT: movaps (%rsp), %xmm0 # [4:0.50] ; CHECK-NEXT: callq __addtf3 -; CHECK-NEXT: addq $40, %rsp -; CHECK-NEXT: retq +; CHECK-NEXT: addq $40, %rsp # [1:0.33] +; CHECK-NEXT: retq # [5:1.00] entry: %0 = bitcast fp128 %x to i128 %bf.clear = and i128 %0, -18446744073709551616 @@ -273,16 +279,16 @@ define fp128 @acosl(fp128 %x) #0 { ; CHECK-LABEL: acosl: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT: movaps (%rsp), %xmm0 +; CHECK-NEXT: subq $40, %rsp # [1:0.33] +; CHECK-NEXT: movaps %xmm0, %xmm1 # [1:1.00] +; CHECK-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax # [4:0.50] +; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: movq $0, (%rsp) # [1:1.00] +; CHECK-NEXT: movaps (%rsp), %xmm0 # [4:0.50] ; CHECK-NEXT: callq __addtf3 -; CHECK-NEXT: addq $40, %rsp -; CHECK-NEXT: retq +; CHECK-NEXT: addq $40, %rsp # [1:0.33] +; CHECK-NEXT: retq # [5:1.00] entry: %0 = bitcast fp128 %x to i128 %bf.clear = and i128 %0, -18446744073709551616 @@ -295,13 +301,13 @@ define fp128 @TestComp(fp128 %x, fp128 %y) #0 { ; CHECK-LABEL: TestComp: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: cmpq $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: jns .LBB8_2 +; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # [1:1.00] +; CHECK-NEXT: cmpq $0, -{{[0-9]+}}(%rsp) # [5:1.00] +; CHECK-NEXT: jns .LBB8_2 # [1:1.00] ; CHECK-NEXT: # BB#1: # %entry -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: movaps %xmm1, %xmm0 # [1:1.00] ; CHECK-NEXT: .LBB8_2: # %entry -; CHECK-NEXT: retq +; CHECK-NEXT: retq # [5:1.00] entry: %0 = bitcast fp128 %x to i128 %cmp = icmp sgt i128 %0, -1 @@ -315,8 +321,8 @@ define fp128 @TestFABS_LD(fp128 %x) #0 { ; CHECK-LABEL: TestFABS_LD: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 # [5:0.50] +; CHECK-NEXT: retq # [5:1.00] entry: %call = tail call fp128 @fabsl(fp128 %x) #2 ret fp128 %call @@ -330,41 +336,47 @@ define void 
@TestCopySign({ fp128, fp128 }* noalias nocapture sret %agg.result, { fp128, fp128 }* byval nocapture readonly align 16 %z) #0 { ; CHECK-LABEL: TestCopySign: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; CHECK-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: pushq %rbp # [1:1.00] +; CHECK-NEXT: pushq %rbx # [1:1.00] +; CHECK-NEXT: subq $40, %rsp # [1:0.33] +; CHECK-NEXT: movq %rdi, %rbx # [1:0.33] +; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # [4:0.50] +; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # [4:0.50] +; CHECK-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill [1:1.00] +; CHECK-NEXT: # [1:1.00] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill [1:1.00] +; CHECK-NEXT: # [1:1.00] ; CHECK-NEXT: callq __gttf2 -; CHECK-NEXT: movl %eax, %ebp -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: movl %eax, %ebp # [1:0.33] +; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload [4:0.50] +; CHECK-NEXT: # [4:0.50] +; CHECK-NEXT: movaps %xmm0, %xmm1 # [1:1.00] ; CHECK-NEXT: callq __subtf3 -; CHECK-NEXT: testl %ebp, %ebp -; CHECK-NEXT: jle .LBB10_1 +; CHECK-NEXT: testl %ebp, %ebp # [1:0.33] +; CHECK-NEXT: jle .LBB10_1 # [1:1.00] ; CHECK-NEXT: # BB#2: # %if.then -; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm1, %xmm2 -; CHECK-NEXT: jmp .LBB10_3 +; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 # [5:0.50] +; CHECK-NEXT: movaps %xmm0, %xmm1 # [1:1.00] +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload [4:0.50] +; CHECK-NEXT: # [4:0.50] +; CHECK-NEXT: movaps %xmm1, %xmm2 # [1:1.00] +; CHECK-NEXT: jmp .LBB10_3 # [1:1.00] ; CHECK-NEXT: .LBB10_1: -; CHECK-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; CHECK-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload [4:0.50] +; CHECK-NEXT: # [4:0.50] ; CHECK-NEXT: .LBB10_3: # %cleanup -; CHECK-NEXT: movaps {{.*}}(%rip), %xmm1 -; CHECK-NEXT: andps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: orps %xmm1, %xmm0 -; CHECK-NEXT: movaps %xmm2, (%rbx) -; CHECK-NEXT: movaps %xmm0, 16(%rbx) -; CHECK-NEXT: movq %rbx, %rax -; CHECK-NEXT: addq $40, %rsp -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: retq +; CHECK-NEXT: movaps {{.*}}(%rip), %xmm1 # [4:0.50] +; CHECK-NEXT: andps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload [5:0.50] +; CHECK-NEXT: # [5:0.50] +; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 # [5:0.50] +; CHECK-NEXT: orps %xmm1, %xmm0 # [1:0.33] +; CHECK-NEXT: movaps %xmm2, (%rbx) # [1:1.00] +; CHECK-NEXT: movaps %xmm0, 16(%rbx) # [1:1.00] +; CHECK-NEXT: movq %rbx, %rax # [1:0.33] +; CHECK-NEXT: addq $40, %rsp # [1:0.33] +; CHECK-NEXT: popq %rbx # [4:0.50] +; CHECK-NEXT: popq %rbp # [4:0.50] +; CHECK-NEXT: retq # [5:1.00] entry: %z.realp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %z, i64 0, i32 0 %z.real = load fp128, fp128* %z.realp, align 16 Index: test/CodeGen/X86/illegal-bitfield-loadstore.ll =================================================================== --- test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -4,14 +4,15 @@ define void @i24_or(i24* %a) { ; CHECK-LABEL: i24_or: ; CHECK: # BB#0: -; 
CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: movb %cl, 2(%rdi) -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx +; CHECK-NEXT: movzwl (%rdi), %eax # [5:0.50] +; CHECK-NEXT: movzbl 2(%rdi), %ecx # [5:0.50] +; CHECK-NEXT: movb %cl, 2(%rdi) # [1:1.00] +; CHECK-NEXT: shll $16, %ecx # [1:0.50] +; CHECK-NEXT: orl %eax, %ecx # [1:0.33] ; CHECK-NEXT: orl $384, %ecx # imm = 0x180 -; CHECK-NEXT: movw %cx, (%rdi) -; CHECK-NEXT: retq +; CHECK-NEXT: # [1:0.33] +; CHECK-NEXT: movw %cx, (%rdi) # [1:1.00] +; CHECK-NEXT: retq # [5:1.00] %aa = load i24, i24* %a, align 1 %b = or i24 %aa, 384 store i24 %b, i24* %a, align 1 @@ -21,15 +22,17 @@ define void @i24_and_or(i24* %a) { ; CHECK-LABEL: i24_and_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: movb %cl, 2(%rdi) -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx +; CHECK-NEXT: movzwl (%rdi), %eax # [5:0.50] +; CHECK-NEXT: movzbl 2(%rdi), %ecx # [5:0.50] +; CHECK-NEXT: movb %cl, 2(%rdi) # [1:1.00] +; CHECK-NEXT: shll $16, %ecx # [1:0.50] +; CHECK-NEXT: orl %eax, %ecx # [1:0.33] ; CHECK-NEXT: orl $384, %ecx # imm = 0x180 +; CHECK-NEXT: # [1:0.33] ; CHECK-NEXT: andl $16777088, %ecx # imm = 0xFFFF80 -; CHECK-NEXT: movw %cx, (%rdi) -; CHECK-NEXT: retq +; CHECK-NEXT: # [1:0.33] +; CHECK-NEXT: movw %cx, (%rdi) # [1:1.00] +; CHECK-NEXT: retq # [5:1.00] %b = load i24, i24* %a, align 1 %c = and i24 %b, -128 %d = or i24 %c, 384 @@ -40,17 +43,18 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { ; CHECK-LABEL: i24_insert_bit: ; CHECK: # BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: movzwl (%rdi), %ecx -; CHECK-NEXT: movzbl 2(%rdi), %edx -; CHECK-NEXT: movb %dl, 2(%rdi) -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: shll $13, %eax +; CHECK-NEXT: movzbl %sil, %eax # [1:0.33] +; CHECK-NEXT: movzwl (%rdi), %ecx # [5:0.50] +; CHECK-NEXT: movzbl 2(%rdi), %edx # [5:0.50] +; CHECK-NEXT: movb %dl, 2(%rdi) # [1:1.00] +; CHECK-NEXT: shll $16, %edx # [1:0.50] +; CHECK-NEXT: orl %ecx, %edx # [1:0.33] +; CHECK-NEXT: shll $13, %eax # [1:0.50] ; CHECK-NEXT: andl $16769023, %edx # imm = 0xFFDFFF -; CHECK-NEXT: orl %eax, %edx -; CHECK-NEXT: movw %dx, (%rdi) -; CHECK-NEXT: retq +; CHECK-NEXT: # [1:0.33] +; CHECK-NEXT: orl %eax, %edx # [1:0.33] +; CHECK-NEXT: movw %dx, (%rdi) # [1:1.00] +; CHECK-NEXT: retq # [5:1.00] %extbit = zext i1 %bit to i24 %b = load i24, i24* %a, align 1 %extbit.shl = shl nuw nsw i24 %extbit, 13 @@ -63,20 +67,21 @@ define void @i56_or(i56* %a) { ; CHECK-LABEL: i56_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl 4(%rdi), %eax -; CHECK-NEXT: movzbl 6(%rdi), %ecx -; CHECK-NEXT: movl (%rdi), %edx -; CHECK-NEXT: movb %cl, 6(%rdi) +; CHECK-NEXT: movzwl 4(%rdi), %eax # [5:0.50] +; CHECK-NEXT: movzbl 6(%rdi), %ecx # [5:0.50] +; CHECK-NEXT: movl (%rdi), %edx # [4:0.50] +; CHECK-NEXT: movb %cl, 6(%rdi) # [1:1.00] ; CHECK-NEXT: # kill: %ECX %ECX %RCX %RCX -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: shll $16, %ecx # [1:0.50] +; CHECK-NEXT: orl %eax, %ecx # [1:0.33] +; CHECK-NEXT: shlq $32, %rcx # [1:0.50] +; CHECK-NEXT: orq %rcx, %rdx # [1:0.33] ; CHECK-NEXT: orq $384, %rdx # imm = 0x180 -; CHECK-NEXT: movl %edx, (%rdi) -; CHECK-NEXT: shrq $32, %rdx -; CHECK-NEXT: movw %dx, 4(%rdi) -; CHECK-NEXT: retq +; CHECK-NEXT: # [1:0.33] +; CHECK-NEXT: movl %edx, (%rdi) # [1:1.00] +; CHECK-NEXT: shrq $32, %rdx # [1:0.50] +; CHECK-NEXT: movw %dx, 
4(%rdi) # [1:1.00] +; CHECK-NEXT: retq # [5:1.00] %aa = load i56, i56* %a, align 1 %b = or i56 %aa, 384 store i56 %b, i56* %a, align 1 @@ -86,22 +91,24 @@ define void @i56_and_or(i56* %a) { ; CHECK-LABEL: i56_and_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl 4(%rdi), %eax -; CHECK-NEXT: movzbl 6(%rdi), %ecx -; CHECK-NEXT: movl (%rdi), %edx -; CHECK-NEXT: movb %cl, 6(%rdi) +; CHECK-NEXT: movzwl 4(%rdi), %eax # [5:0.50] +; CHECK-NEXT: movzbl 6(%rdi), %ecx # [5:0.50] +; CHECK-NEXT: movl (%rdi), %edx # [4:0.50] +; CHECK-NEXT: movb %cl, 6(%rdi) # [1:1.00] ; CHECK-NEXT: # kill: %ECX %ECX %RCX %RCX -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: shll $16, %ecx # [1:0.50] +; CHECK-NEXT: orl %eax, %ecx # [1:0.33] +; CHECK-NEXT: shlq $32, %rcx # [1:0.50] +; CHECK-NEXT: orq %rcx, %rdx # [1:0.33] ; CHECK-NEXT: orq $384, %rdx # imm = 0x180 +; CHECK-NEXT: # [1:0.33] ; CHECK-NEXT: movabsq $72057594037927808, %rax # imm = 0xFFFFFFFFFFFF80 -; CHECK-NEXT: andq %rdx, %rax -; CHECK-NEXT: movl %eax, (%rdi) -; CHECK-NEXT: shrq $32, %rax -; CHECK-NEXT: movw %ax, 4(%rdi) -; CHECK-NEXT: retq +; CHECK-NEXT: # [1:0.33] +; CHECK-NEXT: andq %rdx, %rax # [1:0.33] +; CHECK-NEXT: movl %eax, (%rdi) # [1:1.00] +; CHECK-NEXT: shrq $32, %rax # [1:0.50] +; CHECK-NEXT: movw %ax, 4(%rdi) # [1:1.00] +; CHECK-NEXT: retq # [5:1.00] %b = load i56, i56* %a, align 1 %c = and i56 %b, -128 %d = or i56 %c, 384 @@ -112,24 +119,25 @@ define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { ; CHECK-LABEL: i56_insert_bit: ; CHECK: # BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: movzwl 4(%rdi), %ecx -; CHECK-NEXT: movzbl 6(%rdi), %edx -; CHECK-NEXT: movl (%rdi), %esi -; CHECK-NEXT: movb %dl, 6(%rdi) +; CHECK-NEXT: movzbl %sil, %eax # [1:0.33] +; CHECK-NEXT: movzwl 4(%rdi), %ecx # [5:0.50] +; CHECK-NEXT: movzbl 6(%rdi), %edx # [5:0.50] +; CHECK-NEXT: movl (%rdi), %esi # [4:0.50] +; CHECK-NEXT: movb %dl, 6(%rdi) # [1:1.00] ; CHECK-NEXT: # kill: %EDX %EDX %RDX %RDX -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: shlq $32, %rdx -; CHECK-NEXT: orq %rdx, %rsi -; CHECK-NEXT: shlq $13, %rax +; CHECK-NEXT: shll $16, %edx # [1:0.50] +; CHECK-NEXT: orl %ecx, %edx # [1:0.33] +; CHECK-NEXT: shlq $32, %rdx # [1:0.50] +; CHECK-NEXT: orq %rdx, %rsi # [1:0.33] +; CHECK-NEXT: shlq $13, %rax # [1:0.50] ; CHECK-NEXT: movabsq $72057594037919743, %rcx # imm = 0xFFFFFFFFFFDFFF -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: orq %rax, %rcx -; CHECK-NEXT: movl %ecx, (%rdi) -; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: movw %cx, 4(%rdi) -; CHECK-NEXT: retq +; CHECK-NEXT: # [1:0.33] +; CHECK-NEXT: andq %rsi, %rcx # [1:0.33] +; CHECK-NEXT: orq %rax, %rcx # [1:0.33] +; CHECK-NEXT: movl %ecx, (%rdi) # [1:1.00] +; CHECK-NEXT: shrq $32, %rcx # [1:0.50] +; CHECK-NEXT: movw %cx, 4(%rdi) # [1:1.00] +; CHECK-NEXT: retq # [5:1.00] %extbit = zext i1 %bit to i56 %b = load i56, i56* %a, align 1 %extbit.shl = shl nuw nsw i56 %extbit, 13 Index: test/CodeGen/X86/memset-2.ll =================================================================== --- test/CodeGen/X86/memset-2.ll +++ test/CodeGen/X86/memset-2.ll @@ -1,16 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by update_test_checks.py ; RUN: llc -mtriple=i386-apple-darwin9 -mcpu=yonah < %s | FileCheck %s define fastcc void @t1() nounwind { ; CHECK-LABEL: t1: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: subl $16, %esp -; CHECK-NEXT: pushl 
$188 -; CHECK-NEXT: pushl $0 -; CHECK-NEXT: pushl $0 +; CHECK-NEXT: subl $16, %esp ## [1:0.33] +; CHECK-NEXT: pushl $188 ## [1:1.00] +; CHECK-NEXT: pushl $0 ## [1:1.00] +; CHECK-NEXT: pushl $0 ## [1:1.00] ; CHECK-NEXT: calll _memset -; CHECK-NEXT: addl $16, %esp -; +; CHECK-NEXT: addl $16, %esp ## [1:0.33] entry: call void @llvm.memset.p0i8.i32(i8* null, i8 0, i32 188, i32 1, i1 false) unreachable @@ -19,11 +19,10 @@ define fastcc void @t2(i8 signext %c) nounwind { ; CHECK-LABEL: t2: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: subl $12, %esp -; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl $76, {{[0-9]+}}(%esp) +; CHECK-NEXT: subl $12, %esp ## [1:0.33] +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) ## [1:1.00] +; CHECK-NEXT: movl $76, {{[0-9]+}}(%esp) ## [1:1.00] ; CHECK-NEXT: calll _memset -; entry: call void @llvm.memset.p0i8.i32(i8* undef, i8 %c, i32 76, i32 1, i1 false) unreachable @@ -34,13 +33,13 @@ define void @t3(i8* nocapture %s, i8 %a) nounwind { ; CHECK-LABEL: t3: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## [4:0.50] +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## [5:0.50] ; CHECK-NEXT: imull $16843009, %ecx, %ecx ## imm = 0x1010101 -; CHECK-NEXT: movl %ecx, 4(%eax) -; CHECK-NEXT: movl %ecx, (%eax) -; CHECK-NEXT: retl -; +; CHECK-NEXT: ## [3:1.00] +; CHECK-NEXT: movl %ecx, 4(%eax) ## [1:1.00] +; CHECK-NEXT: movl %ecx, (%eax) ## [1:1.00] +; CHECK-NEXT: retl ## [5:1.00] entry: tail call void @llvm.memset.p0i8.i32(i8* %s, i8 %a, i32 8, i32 1, i1 false) ret void @@ -49,16 +48,16 @@ define void @t4(i8* nocapture %s, i8 %a) nounwind { ; CHECK-LABEL: t4: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## [4:0.50] +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## [5:0.50] ; CHECK-NEXT: imull $16843009, %ecx, %ecx ## imm = 0x1010101 -; CHECK-NEXT: movl %ecx, 8(%eax) -; CHECK-NEXT: movl %ecx, 4(%eax) -; CHECK-NEXT: movl %ecx, (%eax) -; CHECK-NEXT: movw %cx, 12(%eax) -; CHECK-NEXT: movb %cl, 14(%eax) -; CHECK-NEXT: retl -; +; CHECK-NEXT: ## [3:1.00] +; CHECK-NEXT: movl %ecx, 8(%eax) ## [1:1.00] +; CHECK-NEXT: movl %ecx, 4(%eax) ## [1:1.00] +; CHECK-NEXT: movl %ecx, (%eax) ## [1:1.00] +; CHECK-NEXT: movw %cx, 12(%eax) ## [1:1.00] +; CHECK-NEXT: movb %cl, 14(%eax) ## [1:1.00] +; CHECK-NEXT: retl ## [5:1.00] entry: tail call void @llvm.memset.p0i8.i32(i8* %s, i8 %a, i32 15, i32 1, i1 false) ret void Index: test/CodeGen/X86/mul-i256.ll =================================================================== --- test/CodeGen/X86/mul-i256.ll +++ test/CodeGen/X86/mul-i256.ll @@ -8,207 +8,253 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 { ; X32-LABEL: test: ; X32: # BB#0: # %entry -; X32-NEXT: pushl %ebp +; X32-NEXT: pushl %ebp # [1:1.00] ; X32-NEXT: .Lcfi0: ; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: .Lcfi1: ; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl %esp, %ebp +; X32-NEXT: movl %esp, %ebp # [1:0.33] ; X32-NEXT: .Lcfi2: ; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: pushl %edi -; X32-NEXT: pushl %esi -; X32-NEXT: andl $-8, %esp -; X32-NEXT: subl $168, %esp +; X32-NEXT: pushl %ebx # [1:1.00] +; X32-NEXT: pushl %edi # [1:1.00] +; X32-NEXT: pushl %esi # [1:1.00] +; X32-NEXT: andl $-8, %esp # [1:0.33] +; X32-NEXT: subl $168, %esp # [1:0.33] ; X32-NEXT: .Lcfi3: ; X32-NEXT: .cfi_offset %esi, -20 ; X32-NEXT: .Lcfi4: ; X32-NEXT: 
.cfi_offset %edi, -16 ; X32-NEXT: .Lcfi5: ; X32-NEXT: .cfi_offset %ebx, -12 -; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: movl 16(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 20(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 24(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 28(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 8(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 12(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl (%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 4(%eax), %ebx -; X32-NEXT: movl 12(%ebp), %eax -; X32-NEXT: movl 16(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 20(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 24(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 28(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl (%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 4(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 8(%eax), %esi -; X32-NEXT: movl 12(%eax), %edi -; X32-NEXT: leal {{[0-9]+}}(%esp), %eax -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $0 -; X32-NEXT: pushl %edi -; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: pushl %esi -; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $0 -; X32-NEXT: pushl %ebx -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl %eax +; X32-NEXT: movl 8(%ebp), %eax # [4:0.50] +; X32-NEXT: movl 16(%eax), %ecx # [4:0.50] +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: movl 20(%eax), %ecx # [4:0.50] +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: movl 24(%eax), %ecx # [4:0.50] +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: movl 28(%eax), %ecx # [4:0.50] +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: movl 8(%eax), %ecx # [4:0.50] +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: movl 12(%eax), %ecx # [4:0.50] +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: movl (%eax), %ecx # [4:0.50] +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: movl 4(%eax), %ebx # [4:0.50] +; X32-NEXT: movl 12(%ebp), %eax # [4:0.50] +; X32-NEXT: movl 16(%eax), %ecx # [4:0.50] +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: movl 20(%eax), %ecx # [4:0.50] +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: movl 24(%eax), %ecx # [4:0.50] +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: movl 28(%eax), %ecx # [4:0.50] +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: movl (%eax), %ecx # [4:0.50] +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: movl 4(%eax), %ecx # [4:0.50] +; 
X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: movl 8(%eax), %esi # [4:0.50] +; X32-NEXT: movl 12(%eax), %edi # [4:0.50] +; X32-NEXT: leal {{[0-9]+}}(%esp), %eax # [1:0.50] +; X32-NEXT: pushl $0 # [1:1.00] +; X32-NEXT: pushl $0 # [1:1.00] +; X32-NEXT: pushl %edi # [1:1.00] +; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl %esi # [1:1.00] +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl $0 # [1:1.00] +; X32-NEXT: pushl $0 # [1:1.00] +; X32-NEXT: pushl %ebx # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl %eax # [1:1.00] ; X32-NEXT: calll __multi3 -; X32-NEXT: addl $32, %esp -; X32-NEXT: leal {{[0-9]+}}(%esp), %eax -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $0 -; X32-NEXT: pushl %edi -; X32-NEXT: pushl %esi -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $0 -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload -; X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload -; X32-NEXT: pushl %edi -; X32-NEXT: pushl %eax +; X32-NEXT: addl $32, %esp # [1:0.33] +; X32-NEXT: leal {{[0-9]+}}(%esp), %eax # [1:0.50] +; X32-NEXT: pushl $0 # [1:1.00] +; X32-NEXT: pushl $0 # [1:1.00] +; X32-NEXT: pushl %edi # [1:1.00] +; X32-NEXT: pushl %esi # [1:1.00] +; X32-NEXT: pushl $0 # [1:1.00] +; X32-NEXT: pushl $0 # [1:1.00] +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload [4:0.50] +; X32-NEXT: # [4:0.50] +; X32-NEXT: pushl %esi # [1:1.00] +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload [4:0.50] +; X32-NEXT: # [4:0.50] +; X32-NEXT: pushl %edi # [1:1.00] +; X32-NEXT: pushl %eax # [1:1.00] ; X32-NEXT: calll __multi3 -; X32-NEXT: addl $32, %esp -; X32-NEXT: leal {{[0-9]+}}(%esp), %eax -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $0 -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $0 -; X32-NEXT: pushl %ebx -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl %eax +; X32-NEXT: addl $32, %esp # [1:0.33] +; X32-NEXT: leal {{[0-9]+}}(%esp), %eax # [1:0.50] +; X32-NEXT: pushl $0 # [1:1.00] +; X32-NEXT: pushl $0 # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl $0 # [1:1.00] +; X32-NEXT: pushl $0 # [1:1.00] +; X32-NEXT: pushl %ebx # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl %eax # [1:1.00] ; X32-NEXT: calll __multi3 -; X32-NEXT: addl $32, %esp -; X32-NEXT: leal {{[0-9]+}}(%esp), %eax -; X32-NEXT: pushl %esi -; X32-NEXT: pushl %edi -; X32-NEXT: pushl %ebx -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl %eax +; X32-NEXT: addl $32, %esp # [1:0.33] +; X32-NEXT: leal {{[0-9]+}}(%esp), %eax # [1:0.50] +; X32-NEXT: pushl %esi # [1:1.00] +; X32-NEXT: pushl %edi # [1:1.00] +; X32-NEXT: pushl %ebx # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 
4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl %eax # [1:1.00] ; X32-NEXT: calll __multi3 -; X32-NEXT: addl $32, %esp -; X32-NEXT: leal {{[0-9]+}}(%esp), %eax -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $0 -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $0 -; X32-NEXT: pushl %esi -; X32-NEXT: pushl %edi -; X32-NEXT: pushl %eax +; X32-NEXT: addl $32, %esp # [1:0.33] +; X32-NEXT: leal {{[0-9]+}}(%esp), %eax # [1:0.50] +; X32-NEXT: pushl $0 # [1:1.00] +; X32-NEXT: pushl $0 # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl $0 # [1:1.00] +; X32-NEXT: pushl $0 # [1:1.00] +; X32-NEXT: pushl %esi # [1:1.00] +; X32-NEXT: pushl %edi # [1:1.00] +; X32-NEXT: pushl %eax # [1:1.00] ; X32-NEXT: calll __multi3 -; X32-NEXT: addl $32, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: addl {{[0-9]+}}(%esp), %edx -; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: adcl $0, %eax -; X32-NEXT: addl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl %eax, %ebx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: sbbl %eax, %eax -; X32-NEXT: andl $1, %eax -; X32-NEXT: addl {{[0-9]+}}(%esp), %edi -; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: leal {{[0-9]+}}(%esp), %eax -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload -; X32-NEXT: pushl %eax +; X32-NEXT: addl $32, %esp # [1:0.33] +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # [4:0.50] +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # [4:0.50] +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # [4:0.50] +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # [4:0.50] +; X32-NEXT: addl {{[0-9]+}}(%esp), %edx # [5:0.50] +; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # [5:0.50] +; X32-NEXT: adcl $0, %ecx # [1:0.33] +; X32-NEXT: adcl $0, %eax # [1:0.33] +; X32-NEXT: addl {{[0-9]+}}(%esp), %edx # [5:0.50] +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # [5:0.50] +; X32-NEXT: 
movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # [4:0.50] +; X32-NEXT: adcl $0, %edi # [1:0.33] +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # [4:0.50] +; X32-NEXT: adcl $0, %ebx # [1:0.33] +; X32-NEXT: xorl %edx, %edx # [1:0.33] +; X32-NEXT: addl %ecx, %edi # [1:0.33] +; X32-NEXT: adcl %eax, %ebx # [1:0.33] +; X32-NEXT: adcl $0, %edx # [1:0.33] +; X32-NEXT: sbbl %eax, %eax # [1:0.33] +; X32-NEXT: andl $1, %eax # [1:0.33] +; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # [5:0.50] +; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # [5:0.50] +; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # [5:0.50] +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # [5:0.50] +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: leal {{[0-9]+}}(%esp), %eax # [1:0.50] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: pushl %eax # [1:1.00] ; X32-NEXT: calll __multi3 -; X32-NEXT: addl $32, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: addl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl %ebx, %ecx -; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload -; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl 16(%ebp), %edi -; X32-NEXT: movl %ebx, 4(%edi) -; X32-NEXT: movl 16(%ebp), %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload -; X32-NEXT: movl %edi, (%ebx) -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload -; X32-NEXT: movl %edi, 8(%ebx) -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload -; X32-NEXT: movl %edi, 12(%ebx) -; X32-NEXT: movl %esi, 16(%ebx) -; X32-NEXT: movl %ecx, 20(%ebx) -; X32-NEXT: movl %edx, 24(%ebx) -; X32-NEXT: movl %eax, 28(%ebx) -; X32-NEXT: leal -12(%ebp), %esp -; X32-NEXT: popl %esi -; X32-NEXT: popl %edi -; X32-NEXT: popl %ebx -; X32-NEXT: popl %ebp -; X32-NEXT: retl +; X32-NEXT: addl $32, %esp # [1:0.33] +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # [4:0.50] +; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # [5:0.50] +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # [4:0.50] +; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # [5:0.50] +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # [4:0.50] +; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # [5:0.50] +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # [4:0.50] +; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # [5:0.50] +; X32-NEXT: addl %edi, %esi # [1:0.33] +; 
X32-NEXT: adcl %ebx, %ecx # [1:0.33] +; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload [5:0.50] +; X32-NEXT: # [5:0.50] +; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload [5:0.50] +; X32-NEXT: # [5:0.50] +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # [4:0.50] +; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; X32-NEXT: # [1:1.00] +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # [4:0.50] +; X32-NEXT: movl 16(%ebp), %edi # [4:0.50] +; X32-NEXT: movl %ebx, 4(%edi) # [1:1.00] +; X32-NEXT: movl 16(%ebp), %ebx # [4:0.50] +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload [4:0.50] +; X32-NEXT: # [4:0.50] +; X32-NEXT: movl %edi, (%ebx) # [1:1.00] +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload [4:0.50] +; X32-NEXT: # [4:0.50] +; X32-NEXT: movl %edi, 8(%ebx) # [1:1.00] +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload [4:0.50] +; X32-NEXT: # [4:0.50] +; X32-NEXT: movl %edi, 12(%ebx) # [1:1.00] +; X32-NEXT: movl %esi, 16(%ebx) # [1:1.00] +; X32-NEXT: movl %ecx, 20(%ebx) # [1:1.00] +; X32-NEXT: movl %edx, 24(%ebx) # [1:1.00] +; X32-NEXT: movl %eax, 28(%ebx) # [1:1.00] +; X32-NEXT: leal -12(%ebp), %esp # [1:0.50] +; X32-NEXT: popl %esi # [4:0.50] +; X32-NEXT: popl %edi # [4:0.50] +; X32-NEXT: popl %ebx # [4:0.50] +; X32-NEXT: popl %ebp # [4:0.50] +; X32-NEXT: retl # [5:1.00] ; ; X64-LABEL: test: ; X64: # BB#0: # %entry -; X64-NEXT: pushq %r15 +; X64-NEXT: pushq %r15 # [1:1.00] ; X64-NEXT: .Lcfi0: ; X64-NEXT: .cfi_def_cfa_offset 16 -; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %r14 # [1:1.00] ; X64-NEXT: .Lcfi1: ; X64-NEXT: .cfi_def_cfa_offset 24 -; X64-NEXT: pushq %r12 +; X64-NEXT: pushq %r12 # [1:1.00] ; X64-NEXT: .Lcfi2: ; X64-NEXT: .cfi_def_cfa_offset 32 -; X64-NEXT: pushq %rbx +; X64-NEXT: pushq %rbx # [1:1.00] ; X64-NEXT: .Lcfi3: ; X64-NEXT: .cfi_def_cfa_offset 40 ; X64-NEXT: .Lcfi4: @@ -219,66 +265,66 @@ ; X64-NEXT: .cfi_offset %r14, -24 ; X64-NEXT: .Lcfi7: ; X64-NEXT: .cfi_offset %r15, -16 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq (%rdi), %r14 -; X64-NEXT: movq 8(%rdi), %r8 -; X64-NEXT: movq 16(%rdi), %rcx -; X64-NEXT: movq 16(%rsi), %rbx -; X64-NEXT: movq (%rsi), %r12 -; X64-NEXT: movq 8(%rsi), %r15 -; X64-NEXT: movq 24(%rdi), %rdi -; X64-NEXT: imulq %r12, %rdi -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: imulq %r15, %rcx -; X64-NEXT: addq %rdx, %rcx -; X64-NEXT: movq %rbx, %rdi -; X64-NEXT: imulq %r8, %rdi -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: movq 24(%rsi), %rbx -; X64-NEXT: imulq %r14, %rbx -; X64-NEXT: addq %rdx, %rbx -; X64-NEXT: addq %r9, %r11 -; X64-NEXT: adcq %rcx, %rbx -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rsi, %rdi -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rdi, %r14 -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: addq %rcx, %rsi -; X64-NEXT: sbbq %rcx, %rcx -; X64-NEXT: andl $1, %ecx -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: addq %rsi, %rax -; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %r11, %rax -; X64-NEXT: adcq %rbx, %rdx -; X64-NEXT: movq %r9, (%r10) -; X64-NEXT: movq %r14, 8(%r10) -; X64-NEXT: movq %rax, 16(%r10) -; X64-NEXT: movq %rdx, 
24(%r10) -; X64-NEXT: popq %rbx -; X64-NEXT: popq %r12 -; X64-NEXT: popq %r14 -; X64-NEXT: popq %r15 -; X64-NEXT: retq +; X64-NEXT: movq %rdx, %r10 # [1:0.33] +; X64-NEXT: movq (%rdi), %r14 # [4:0.50] +; X64-NEXT: movq 8(%rdi), %r8 # [4:0.50] +; X64-NEXT: movq 16(%rdi), %rcx # [4:0.50] +; X64-NEXT: movq 16(%rsi), %rbx # [4:0.50] +; X64-NEXT: movq (%rsi), %r12 # [4:0.50] +; X64-NEXT: movq 8(%rsi), %r15 # [4:0.50] +; X64-NEXT: movq 24(%rdi), %rdi # [4:0.50] +; X64-NEXT: imulq %r12, %rdi # [3:1.00] +; X64-NEXT: movq %r12, %rax # [1:0.33] +; X64-NEXT: mulq %rcx # [3:1.00] +; X64-NEXT: movq %rax, %r9 # [1:0.33] +; X64-NEXT: addq %rdi, %rdx # [1:0.33] +; X64-NEXT: imulq %r15, %rcx # [3:1.00] +; X64-NEXT: addq %rdx, %rcx # [1:0.33] +; X64-NEXT: movq %rbx, %rdi # [1:0.33] +; X64-NEXT: imulq %r8, %rdi # [3:1.00] +; X64-NEXT: movq %rbx, %rax # [1:0.33] +; X64-NEXT: mulq %r14 # [3:1.00] +; X64-NEXT: movq %rax, %r11 # [1:0.33] +; X64-NEXT: addq %rdi, %rdx # [1:0.33] +; X64-NEXT: movq 24(%rsi), %rbx # [4:0.50] +; X64-NEXT: imulq %r14, %rbx # [3:1.00] +; X64-NEXT: addq %rdx, %rbx # [1:0.33] +; X64-NEXT: addq %r9, %r11 # [1:0.33] +; X64-NEXT: adcq %rcx, %rbx # [1:0.33] +; X64-NEXT: movq %r14, %rax # [1:0.33] +; X64-NEXT: mulq %r12 # [3:1.00] +; X64-NEXT: movq %rdx, %rsi # [1:0.33] +; X64-NEXT: movq %rax, %r9 # [1:0.33] +; X64-NEXT: movq %r8, %rax # [1:0.33] +; X64-NEXT: mulq %r12 # [3:1.00] +; X64-NEXT: movq %rdx, %rcx # [1:0.33] +; X64-NEXT: movq %rax, %rdi # [1:0.33] +; X64-NEXT: addq %rsi, %rdi # [1:0.33] +; X64-NEXT: adcq $0, %rcx # [1:0.33] +; X64-NEXT: movq %r14, %rax # [1:0.33] +; X64-NEXT: mulq %r15 # [3:1.00] +; X64-NEXT: movq %rdx, %rsi # [1:0.33] +; X64-NEXT: movq %rax, %r14 # [1:0.33] +; X64-NEXT: addq %rdi, %r14 # [1:0.33] +; X64-NEXT: adcq $0, %rsi # [1:0.33] +; X64-NEXT: addq %rcx, %rsi # [1:0.33] +; X64-NEXT: sbbq %rcx, %rcx # [1:0.33] +; X64-NEXT: andl $1, %ecx # [1:0.33] +; X64-NEXT: movq %r8, %rax # [1:0.33] +; X64-NEXT: mulq %r15 # [3:1.00] +; X64-NEXT: addq %rsi, %rax # [1:0.33] +; X64-NEXT: adcq %rcx, %rdx # [1:0.33] +; X64-NEXT: addq %r11, %rax # [1:0.33] +; X64-NEXT: adcq %rbx, %rdx # [1:0.33] +; X64-NEXT: movq %r9, (%r10) # [1:1.00] +; X64-NEXT: movq %r14, 8(%r10) # [1:1.00] +; X64-NEXT: movq %rax, 16(%r10) # [1:1.00] +; X64-NEXT: movq %rdx, 24(%r10) # [1:1.00] +; X64-NEXT: popq %rbx # [4:0.50] +; X64-NEXT: popq %r12 # [4:0.50] +; X64-NEXT: popq %r14 # [4:0.50] +; X64-NEXT: popq %r15 # [4:0.50] +; X64-NEXT: retq # [5:1.00] entry: %av = load i256, i256* %a %bv = load i256, i256* %b Index: test/CodeGen/X86/pr21792.ll =================================================================== --- test/CodeGen/X86/pr21792.ll +++ test/CodeGen/X86/pr21792.ll @@ -9,27 +9,29 @@ define void @func(<4 x float> %vx) { ; CHECK-LABEL: func: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: pushq %rax +; CHECK-NEXT: pushq %rax # [1:1.00] ; CHECK-NEXT: .Lcfi0: ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pextrq $1, %xmm0, %rdx -; CHECK-NEXT: movq %rdx, %rcx -; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: movd %xmm0, %rax -; CHECK-NEXT: movq %rax, %r9 -; CHECK-NEXT: shrq $32, %r9 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 # [5:0.50] +; CHECK-NEXT: pextrq $1, %xmm0, %rdx # [1:0.50] +; CHECK-NEXT: movq %rdx, %rcx # [1:0.33] +; CHECK-NEXT: shrq $32, %rcx # [1:0.50] +; CHECK-NEXT: movd %xmm0, %rax # [1:0.33] +; CHECK-NEXT: movq %rax, %r9 # [1:0.33] +; CHECK-NEXT: shrq $32, %r9 # [1:0.50] ; CHECK-NEXT: andl $2032, %eax # imm = 0x7F0 -; CHECK-NEXT: leaq stuff(%rax), %rdi -; 
CHECK-NEXT: leaq stuff(%r9), %rsi +; CHECK-NEXT: # [1:0.33] +; CHECK-NEXT: leaq stuff(%rax), %rdi # [1:0.50] +; CHECK-NEXT: leaq stuff(%r9), %rsi # [1:0.50] ; CHECK-NEXT: andl $2032, %edx # imm = 0x7F0 -; CHECK-NEXT: leaq stuff(%rdx), %rdx -; CHECK-NEXT: leaq stuff(%rcx), %rcx -; CHECK-NEXT: leaq stuff+8(%rax), %r8 -; CHECK-NEXT: leaq stuff+8(%r9), %r9 +; CHECK-NEXT: # [1:0.33] +; CHECK-NEXT: leaq stuff(%rdx), %rdx # [1:0.50] +; CHECK-NEXT: leaq stuff(%rcx), %rcx # [1:0.50] +; CHECK-NEXT: leaq stuff+8(%rax), %r8 # [1:0.50] +; CHECK-NEXT: leaq stuff+8(%r9), %r9 # [1:0.50] ; CHECK-NEXT: callq toto -; CHECK-NEXT: popq %rax -; CHECK-NEXT: retq +; CHECK-NEXT: popq %rax # [4:0.50] +; CHECK-NEXT: retq # [5:1.00] entry: %tmp2 = bitcast <4 x float> %vx to <2 x i64> %and.i = and <2 x i64> %tmp2, Index: test/CodeGen/X86/pr32241.ll =================================================================== --- test/CodeGen/X86/pr32241.ll +++ test/CodeGen/X86/pr32241.ll @@ -4,50 +4,54 @@ define i32 @_Z3foov() { ; CHECK-LABEL: _Z3foov: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: subl $20, %esp +; CHECK-NEXT: subl $20, %esp # [1:0.25] ; CHECK-NEXT: .Lcfi0: ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: movw $10959, {{[0-9]+}}(%esp) # imm = 0x2ACF +; CHECK-NEXT: # [1:1.00] ; CHECK-NEXT: movw $-15498, {{[0-9]+}}(%esp) # imm = 0xC376 +; CHECK-NEXT: # [1:1.00] ; CHECK-NEXT: movw $19417, {{[0-9]+}}(%esp) # imm = 0x4BD9 -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %cx +; CHECK-NEXT: # [1:1.00] +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax # [4:0.50] +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %cx # [5:0.50] ; CHECK-NEXT: kxnorw %k0, %k0, %k0 ; CHECK-NEXT: kshiftrw $15, %k0, %k0 -; CHECK-NEXT: testw %cx, %cx -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill -; CHECK-NEXT: jne .LBB0_2 -; CHECK-NEXT: jmp .LBB0_1 +; CHECK-NEXT: testw %cx, %cx # [1:0.25] +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; CHECK-NEXT: # [1:1.00] +; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill [1:0.00] +; CHECK-NEXT: jne .LBB0_2 # [1:0.50] +; CHECK-NEXT: jmp .LBB0_1 # [1:0.50] ; CHECK-NEXT: .LBB0_1: # %lor.rhs -; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: xorl %eax, %eax # [1:0.25] ; CHECK-NEXT: kmovd %eax, %k0 -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill -; CHECK-NEXT: jmp .LBB0_2 +; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill [1:0.00] +; CHECK-NEXT: jmp .LBB0_2 # [1:0.50] ; CHECK-NEXT: .LBB0_2: # %lor.end -; CHECK-NEXT: kmovw {{[0-9]+}}(%esp), %k0 # 2-byte Reload +; CHECK-NEXT: kmovw {{[0-9]+}}(%esp), %k0 # 2-byte Reload [4:0.00] ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: kshiftrw $15, %k1, %k1 -; CHECK-NEXT: movb $1, %al -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill -; CHECK-NEXT: kmovw %k1, {{[0-9]+}}(%esp) # 2-byte Spill -; CHECK-NEXT: jne .LBB0_4 -; CHECK-NEXT: jmp .LBB0_3 +; CHECK-NEXT: movb $1, %al # [1:0.25] +; CHECK-NEXT: testb %al, %al # [1:0.25] +; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill [1:0.00] +; CHECK-NEXT: kmovw %k1, {{[0-9]+}}(%esp) # 2-byte Spill [1:0.00] +; CHECK-NEXT: jne .LBB0_4 # [1:0.50] +; CHECK-NEXT: jmp .LBB0_3 # [1:0.50] ; CHECK-NEXT: .LBB0_3: # %lor.rhs4 -; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: xorl %eax, %eax # [1:0.25] ; CHECK-NEXT: kmovd %eax, %k0 -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill -; CHECK-NEXT: jmp .LBB0_4 +; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill [1:0.00] 
+; CHECK-NEXT: jmp .LBB0_4 # [1:0.50] ; CHECK-NEXT: .LBB0_4: # %lor.end5 -; CHECK-NEXT: kmovw {{[0-9]+}}(%esp), %k0 # 2-byte Reload +; CHECK-NEXT: kmovw {{[0-9]+}}(%esp), %k0 # 2-byte Reload [4:0.00] ; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: movw %ax, %cx -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl $20, %esp -; CHECK-NEXT: retl +; CHECK-NEXT: andl $1, %eax # [1:0.25] +; CHECK-NEXT: movw %ax, %cx # [1:0.25] +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%esp) # [1:1.00] +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax # [4:0.50] +; CHECK-NEXT: addl $20, %esp # [1:0.25] +; CHECK-NEXT: retl # [1:1.00] entry: %aa = alloca i16, align 2 %bb = alloca i16, align 2 Index: test/CodeGen/X86/pr32256.ll =================================================================== --- test/CodeGen/X86/pr32256.ll +++ test/CodeGen/X86/pr32256.ll @@ -7,40 +7,40 @@ define void @_Z1av() { ; CHECK-LABEL: _Z1av: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: subl $6, %esp +; CHECK-NEXT: subl $6, %esp # [1:0.25] ; CHECK-NEXT: .Lcfi0: ; CHECK-NEXT: .cfi_def_cfa_offset 10 -; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: xorl %eax, %eax # [1:0.25] ; CHECK-NEXT: kmovd %eax, %k0 -; CHECK-NEXT: movb c, %cl +; CHECK-NEXT: movb c, %cl # [4:0.50] ; CHECK-NEXT: # implicit-def: %EAX -; CHECK-NEXT: movb %cl, %al -; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: movb %cl, %al # [1:0.25] +; CHECK-NEXT: andl $1, %eax # [1:0.25] ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: kmovq %k1, %k2 ; CHECK-NEXT: kxnorw %k0, %k0, %k3 ; CHECK-NEXT: kshiftrw $15, %k3, %k3 ; CHECK-NEXT: kxorw %k3, %k1, %k1 ; CHECK-NEXT: kmovd %k1, %eax -; CHECK-NEXT: movb %al, %cl -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: kmovw %k2, {{[0-9]+}}(%esp) # 2-byte Spill -; CHECK-NEXT: kmovw %k0, (%esp) # 2-byte Spill -; CHECK-NEXT: jne .LBB0_1 -; CHECK-NEXT: jmp .LBB0_2 +; CHECK-NEXT: movb %al, %cl # [1:0.25] +; CHECK-NEXT: testb $1, %cl # [1:0.25] +; CHECK-NEXT: kmovw %k2, {{[0-9]+}}(%esp) # 2-byte Spill [1:0.00] +; CHECK-NEXT: kmovw %k0, (%esp) # 2-byte Spill [1:0.00] +; CHECK-NEXT: jne .LBB0_1 # [1:0.50] +; CHECK-NEXT: jmp .LBB0_2 # [1:0.50] ; CHECK-NEXT: .LBB0_1: # %land.rhs -; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: xorl %eax, %eax # [1:0.25] ; CHECK-NEXT: kmovd %eax, %k0 -; CHECK-NEXT: kmovw %k0, (%esp) # 2-byte Spill -; CHECK-NEXT: jmp .LBB0_2 +; CHECK-NEXT: kmovw %k0, (%esp) # 2-byte Spill [1:0.00] +; CHECK-NEXT: jmp .LBB0_2 # [1:0.50] ; CHECK-NEXT: .LBB0_2: # %land.end -; CHECK-NEXT: kmovw (%esp), %k0 # 2-byte Reload +; CHECK-NEXT: kmovw (%esp), %k0 # 2-byte Reload [4:0.00] ; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: movb %al, %cl -; CHECK-NEXT: andb $1, %cl -; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) -; CHECK-NEXT: addl $6, %esp -; CHECK-NEXT: retl +; CHECK-NEXT: movb %al, %cl # [1:0.25] +; CHECK-NEXT: andb $1, %cl # [1:0.25] +; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) # [1:1.00] +; CHECK-NEXT: addl $6, %esp # [1:0.25] +; CHECK-NEXT: retl # [1:1.00] entry: %b = alloca i8, align 1 %0 = load i8, i8* @c, align 1 Index: test/CodeGen/X86/pr32329.ll =================================================================== --- test/CodeGen/X86/pr32329.ll +++ test/CodeGen/X86/pr32329.ll @@ -16,16 +16,16 @@ define void @foo() local_unnamed_addr { ; 686-LABEL: foo: ; 686: # BB#0: # %entry -; 686-NEXT: pushl %ebp +; 686-NEXT: pushl %ebp # [1:1.00] ; 686-NEXT: .Lcfi0: ; 686-NEXT: .cfi_def_cfa_offset 8 -; 686-NEXT: pushl %ebx +; 686-NEXT: pushl %ebx # [1:1.00] ; 686-NEXT: .Lcfi1: ; 686-NEXT: .cfi_def_cfa_offset 12 -; 
686-NEXT: pushl %edi +; 686-NEXT: pushl %edi # [1:1.00] ; 686-NEXT: .Lcfi2: ; 686-NEXT: .cfi_def_cfa_offset 16 -; 686-NEXT: pushl %esi +; 686-NEXT: pushl %esi # [1:1.00] ; 686-NEXT: .Lcfi3: ; 686-NEXT: .cfi_def_cfa_offset 20 ; 686-NEXT: .Lcfi4: @@ -36,62 +36,66 @@ ; 686-NEXT: .cfi_offset %ebx, -12 ; 686-NEXT: .Lcfi7: ; 686-NEXT: .cfi_offset %ebp, -8 -; 686-NEXT: movl obj, %edx -; 686-NEXT: movsbl var_27, %eax -; 686-NEXT: movzwl var_2, %esi -; 686-NEXT: movl var_310, %ecx -; 686-NEXT: imull %eax, %ecx -; 686-NEXT: addl var_24, %ecx +; 686-NEXT: movl obj, %edx # [4:0.50] +; 686-NEXT: movsbl var_27, %eax # [4:0.50] +; 686-NEXT: movzwl var_2, %esi # [4:0.50] +; 686-NEXT: movl var_310, %ecx # [4:0.50] +; 686-NEXT: imull %eax, %ecx # [4:1.00] +; 686-NEXT: addl var_24, %ecx # [5:0.50] ; 686-NEXT: andl $4194303, %edx # imm = 0x3FFFFF -; 686-NEXT: leal (%edx,%edx), %ebx -; 686-NEXT: subl %eax, %ebx -; 686-NEXT: movl %ebx, %edi -; 686-NEXT: subl %esi, %edi -; 686-NEXT: imull %edi, %ecx +; 686-NEXT: # [1:0.25] +; 686-NEXT: leal (%edx,%edx), %ebx # [1:0.50] +; 686-NEXT: subl %eax, %ebx # [1:0.25] +; 686-NEXT: movl %ebx, %edi # [1:0.25] +; 686-NEXT: subl %esi, %edi # [1:0.25] +; 686-NEXT: imull %edi, %ecx # [4:1.00] ; 686-NEXT: addl $-1437483407, %ecx # imm = 0xAA51BE71 -; 686-NEXT: movl $9, %esi -; 686-NEXT: xorl %ebp, %ebp -; 686-NEXT: shldl %cl, %esi, %ebp -; 686-NEXT: shll %cl, %esi -; 686-NEXT: testb $32, %cl -; 686-NEXT: cmovnel %esi, %ebp -; 686-NEXT: movl $0, %ecx -; 686-NEXT: cmovnel %ecx, %esi -; 686-NEXT: cmpl %edx, %edi -; 686-NEXT: movl %ebp, var_50+4 -; 686-NEXT: movl %esi, var_50 -; 686-NEXT: setge var_205 -; 686-NEXT: imull %eax, %ebx -; 686-NEXT: movb %bl, var_218 -; 686-NEXT: popl %esi -; 686-NEXT: popl %edi -; 686-NEXT: popl %ebx -; 686-NEXT: popl %ebp -; 686-NEXT: retl +; 686-NEXT: # [1:0.25] +; 686-NEXT: movl $9, %esi # [1:0.25] +; 686-NEXT: xorl %ebp, %ebp # [1:0.25] +; 686-NEXT: shldl %cl, %esi, %ebp # [3:0.25] +; 686-NEXT: shll %cl, %esi # [2:1.50] +; 686-NEXT: testb $32, %cl # [1:0.25] +; 686-NEXT: cmovnel %esi, %ebp # [2:0.50] +; 686-NEXT: movl $0, %ecx # [1:0.25] +; 686-NEXT: cmovnel %ecx, %esi # [2:0.50] +; 686-NEXT: cmpl %edx, %edi # [1:0.25] +; 686-NEXT: movl %ebp, var_50+4 # [1:1.00] +; 686-NEXT: movl %esi, var_50 # [1:1.00] +; 686-NEXT: setge var_205 # [1:1.00] +; 686-NEXT: imull %eax, %ebx # [4:1.00] +; 686-NEXT: movb %bl, var_218 # [1:1.00] +; 686-NEXT: popl %esi # [4:0.50] +; 686-NEXT: popl %edi # [4:0.50] +; 686-NEXT: popl %ebx # [4:0.50] +; 686-NEXT: popl %ebp # [4:0.50] +; 686-NEXT: retl # [1:1.00] ; ; X64-LABEL: foo: ; X64: # BB#0: # %entry -; X64-NEXT: movl {{.*}}(%rip), %eax -; X64-NEXT: movsbl {{.*}}(%rip), %r9d -; X64-NEXT: movzwl {{.*}}(%rip), %r8d -; X64-NEXT: movl {{.*}}(%rip), %esi -; X64-NEXT: imull %r9d, %esi -; X64-NEXT: addl {{.*}}(%rip), %esi +; X64-NEXT: movl {{.*}}(%rip), %eax # [4:0.50] +; X64-NEXT: movsbl {{.*}}(%rip), %r9d # [4:0.50] +; X64-NEXT: movzwl {{.*}}(%rip), %r8d # [4:0.50] +; X64-NEXT: movl {{.*}}(%rip), %esi # [4:0.50] +; X64-NEXT: imull %r9d, %esi # [4:1.00] +; X64-NEXT: addl {{.*}}(%rip), %esi # [5:0.50] ; X64-NEXT: andl $4194303, %eax # imm = 0x3FFFFF -; X64-NEXT: leal (%rax,%rax), %edi -; X64-NEXT: subl %r9d, %edi -; X64-NEXT: movl %edi, %edx -; X64-NEXT: subl %r8d, %edx -; X64-NEXT: imull %edx, %esi +; X64-NEXT: # [1:0.25] +; X64-NEXT: leal (%rax,%rax), %edi # [1:0.50] +; X64-NEXT: subl %r9d, %edi # [1:0.25] +; X64-NEXT: movl %edi, %edx # [1:0.25] +; X64-NEXT: subl %r8d, %edx # [1:0.25] +; X64-NEXT: imull %edx, %esi # [4:1.00] ; 
X64-NEXT: addl $-1437483407, %esi # imm = 0xAA51BE71 -; X64-NEXT: movl $9, %ecx -; X64-NEXT: shlxq %rsi, %rcx, %rcx -; X64-NEXT: movq %rcx, {{.*}}(%rip) -; X64-NEXT: cmpl %eax, %edx -; X64-NEXT: setge {{.*}}(%rip) -; X64-NEXT: imull %r9d, %edi -; X64-NEXT: movb %dil, {{.*}}(%rip) -; X64-NEXT: retq +; X64-NEXT: # [1:0.25] +; X64-NEXT: movl $9, %ecx # [1:0.25] +; X64-NEXT: shlxq %rsi, %rcx, %rcx # [1:0.50] +; X64-NEXT: movq %rcx, {{.*}}(%rip) # [1:1.00] +; X64-NEXT: cmpl %eax, %edx # [1:0.25] +; X64-NEXT: setge {{.*}}(%rip) # [1:1.00] +; X64-NEXT: imull %r9d, %edi # [4:1.00] +; X64-NEXT: movb %dil, {{.*}}(%rip) # [1:1.00] +; X64-NEXT: retq # [1:1.00] entry: %bf.load = load i32, i32* bitcast (%struct.AA* @obj to i32*), align 8 %bf.clear = shl i32 %bf.load, 1 Index: test/CodeGen/X86/pr32451.ll =================================================================== --- test/CodeGen/X86/pr32451.ll +++ test/CodeGen/X86/pr32451.ll @@ -9,38 +9,43 @@ define i8** @japi1_convert_690(i8**, i8***, i32) { ; CHECK-LABEL: japi1_convert_690: ; CHECK: # BB#0: # %top -; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: pushl %ebx # [1:1.00] ; CHECK-NEXT: .Lcfi0: ; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: subl $16, %esp +; CHECK-NEXT: subl $16, %esp # [1:0.25] ; CHECK-NEXT: .Lcfi1: ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: .Lcfi2: ; CHECK-NEXT: .cfi_offset %ebx, -8 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax # [4:0.50] +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; CHECK-NEXT: # [1:1.00] ; CHECK-NEXT: calll julia.gc_root_decl -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; CHECK-NEXT: # [1:1.00] ; CHECK-NEXT: calll jl_get_ptls_states -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload -; CHECK-NEXT: movl 4(%ecx), %edx -; CHECK-NEXT: movb (%edx), %bl +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload [4:0.50] +; CHECK-NEXT: # [4:0.50] +; CHECK-NEXT: movl 4(%ecx), %edx # [4:0.50] +; CHECK-NEXT: movb (%edx), %bl # [4:0.50] ; CHECK-NEXT: # implicit-def: %EDX -; CHECK-NEXT: movb %bl, %dl -; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: movb %bl, %dl # [1:0.25] +; CHECK-NEXT: andl $1, %edx # [1:0.25] ; CHECK-NEXT: kmovw %edx, %k0 ; CHECK-NEXT: kmovw %k0, %edx -; CHECK-NEXT: movb %dl, %bl -; CHECK-NEXT: andb $1, %bl -; CHECK-NEXT: movzbl %bl, %edx -; CHECK-NEXT: movl %edx, (%esp) -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill +; CHECK-NEXT: movb %dl, %bl # [1:0.25] +; CHECK-NEXT: andb $1, %bl # [1:0.25] +; CHECK-NEXT: movzbl %bl, %edx # [1:0.25] +; CHECK-NEXT: movl %edx, (%esp) # [1:1.00] +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill [1:1.00] +; CHECK-NEXT: # [1:1.00] ; CHECK-NEXT: calll jl_box_int32 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload -; CHECK-NEXT: movl %eax, (%ecx) -; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: popl %ebx -; CHECK-NEXT: retl +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload [4:0.50] +; CHECK-NEXT: # [4:0.50] +; CHECK-NEXT: movl %eax, (%ecx) # [1:1.00] +; CHECK-NEXT: addl $16, %esp # [1:0.25] +; CHECK-NEXT: popl %ebx # [4:0.50] +; CHECK-NEXT: retl # [1:1.00] top: %3 = alloca i8*** store volatile i8*** %1, i8**** %3 Index: test/CodeGen/X86/setcc-lowering.ll =================================================================== --- test/CodeGen/X86/setcc-lowering.ll +++ test/CodeGen/X86/setcc-lowering.ll @@ -23,16 +23,16 @@ ; ; KNL-32-LABEL: 
pr25080: ; KNL-32: # BB#0: # %entry -; KNL-32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1 -; KNL-32-NEXT: vpand %ymm1, %ymm0, %ymm0 -; KNL-32-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL-32-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; KNL-32-NEXT: movb $15, %al +; KNL-32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1 # [4:0.50] +; KNL-32-NEXT: vpand %ymm1, %ymm0, %ymm0 # [1:0.33] +; KNL-32-NEXT: vpxor %ymm1, %ymm1, %ymm1 # [1:0.33] +; KNL-32-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # [?:0.000000e+00] +; KNL-32-NEXT: movb $15, %al # [1:0.25] ; KNL-32-NEXT: kmovw %eax, %k1 ; KNL-32-NEXT: korw %k1, %k0, %k1 ; KNL-32-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-32-NEXT: vpmovqw %zmm0, %xmm0 -; KNL-32-NEXT: retl +; KNL-32-NEXT: retl # [1:1.00] entry: %0 = trunc <8 x i32> %a to <8 x i23> %1 = icmp eq <8 x i23> %0, zeroinitializer @@ -56,27 +56,28 @@ ; ; KNL-32-LABEL: pr26232: ; KNL-32: # BB#0: # %for_loop599.preheader -; KNL-32-NEXT: pushl %esi +; KNL-32-NEXT: pushl %esi # [1:1.00] ; KNL-32-NEXT: .Lcfi0: ; KNL-32-NEXT: .cfi_def_cfa_offset 8 ; KNL-32-NEXT: .Lcfi1: ; KNL-32-NEXT: .cfi_offset %esi, -8 -; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; KNL-32-NEXT: movw $-1, %dx +; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %eax # [4:0.50] +; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx # [4:0.50] +; KNL-32-NEXT: movw $-1, %dx # [1:0.25] ; KNL-32-NEXT: .p2align 4, 0x90 ; KNL-32-NEXT: .LBB1_1: # %for_loop599 ; KNL-32-NEXT: # =>This Inner Loop Header: Depth=1 ; KNL-32-NEXT: cmpl $65536, %ecx # imm = 0x10000 -; KNL-32-NEXT: movl %eax, %esi -; KNL-32-NEXT: sbbl $0, %esi -; KNL-32-NEXT: movl $0, %esi -; KNL-32-NEXT: cmovlw %dx, %si -; KNL-32-NEXT: testw %si, %si -; KNL-32-NEXT: jne .LBB1_1 +; KNL-32-NEXT: # [1:0.25] +; KNL-32-NEXT: movl %eax, %esi # [1:0.25] +; KNL-32-NEXT: sbbl $0, %esi # [2:0.50] +; KNL-32-NEXT: movl $0, %esi # [1:0.25] +; KNL-32-NEXT: cmovlw %dx, %si # [2:0.50] +; KNL-32-NEXT: testw %si, %si # [1:0.25] +; KNL-32-NEXT: jne .LBB1_1 # [1:0.50] ; KNL-32-NEXT: # BB#2: # %for_exit600 -; KNL-32-NEXT: popl %esi -; KNL-32-NEXT: retl +; KNL-32-NEXT: popl %esi # [4:0.50] +; KNL-32-NEXT: retl # [1:1.00] allocas: br label %for_test11.preheader Index: test/CodeGen/X86/sse41-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/sse41-intrinsics-x86.ll +++ test/CodeGen/X86/sse41-intrinsics-x86.ll @@ -12,10 +12,15 @@ ; SSE41-NEXT: movapd %xmm3, %xmm0 ## encoding: [0x66,0x0f,0x28,0xc3] ; SSE41-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse41_blendvpd: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4b,0xc1,0x20] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse41_blendvpd: +; AVX2: ## BB#0: +; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4b,0xc1,0x20] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse41_blendvpd: +; SKX: ## BB#0: +; SKX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4b,0xc1,0x20][2:2.00] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ; <<2 x double>> [#uses=1] ret <2 x double> %res } @@ -31,10 +36,15 @@ ; SSE41-NEXT: movaps %xmm3, %xmm0 ## encoding: [0x0f,0x28,0xc3] ; SSE41-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse41_blendvps: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ## encoding: 
[0xc4,0xe3,0x79,0x4a,0xc1,0x20] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse41_blendvps: +; AVX2: ## BB#0: +; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4a,0xc1,0x20] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse41_blendvps: +; SKX: ## BB#0: +; SKX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4a,0xc1,0x20][2:2.00] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -47,10 +57,15 @@ ; SSE41-NEXT: dppd $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x41,0xc1,0x07] ; SSE41-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse41_dppd: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x41,0xc1,0x07] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse41_dppd: +; AVX2: ## BB#0: +; AVX2-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x41,0xc1,0x07] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse41_dppd: +; SKX: ## BB#0: +; SKX-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x41,0xc1,0x07][9:1.00] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1] ret <2 x double> %res } @@ -63,10 +78,15 @@ ; SSE41-NEXT: dpps $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x40,0xc1,0x07] ; SSE41-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse41_dpps: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x40,0xc1,0x07] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse41_dpps: +; AVX2: ## BB#0: +; AVX2-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x40,0xc1,0x07] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse41_dpps: +; SKX: ## BB#0: +; SKX-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x40,0xc1,0x07][14:2.00] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -76,21 +96,18 @@ define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) { ; SSE41-LABEL: test_x86_sse41_insertps: ; SSE41: ## BB#0: -; SSE41-NEXT: insertps $17, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x11] -; SSE41-NEXT: ## xmm0 = zero,xmm1[0],xmm0[2,3] +; SSE41-NEXT: insertps $17, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x11]xmm0 = zero,xmm1[0],xmm0[2,3] ; SSE41-NEXT: retl ## encoding: [0xc3] ; ; AVX2-LABEL: test_x86_sse41_insertps: ; AVX2: ## BB#0: -; AVX2-NEXT: vinsertps $17, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x11] -; AVX2-NEXT: ## xmm0 = zero,xmm1[0],xmm0[2,3] +; AVX2-NEXT: vinsertps $17, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x11]xmm0 = zero,xmm1[0],xmm0[2,3] ; AVX2-NEXT: retl ## encoding: [0xc3] ; ; SKX-LABEL: test_x86_sse41_insertps: ; SKX: ## BB#0: -; SKX-NEXT: vinsertps $17, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x11] -; SKX-NEXT: ## xmm0 = zero,xmm1[0],xmm0[2,3] -; SKX-NEXT: retl ## encoding: [0xc3] +; SKX-NEXT: vinsertps $17, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x11]xmm0 = zero,xmm1[0],xmm0[2,3][1:1.00] +; SKX-NEXT: 
retl ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 17) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -104,10 +121,15 @@ ; SSE41-NEXT: mpsadbw $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x42,0xc1,0x07] ; SSE41-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse41_mpsadbw: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x42,0xc1,0x07] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse41_mpsadbw: +; AVX2: ## BB#0: +; AVX2-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x42,0xc1,0x07] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse41_mpsadbw: +; SKX: ## BB#0: +; SKX-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x42,0xc1,0x07][6:2.00] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } @@ -127,8 +149,8 @@ ; ; SKX-LABEL: test_x86_sse41_packusdw: ; SKX: ## BB#0: -; SKX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SKX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1][1:1.00] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } @@ -144,10 +166,15 @@ ; SSE41-NEXT: movdqa %xmm3, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc3] ; SSE41-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse41_pblendvb: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4c,0xc1,0x20] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse41_pblendvb: +; AVX2: ## BB#0: +; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4c,0xc1,0x20] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse41_pblendvb: +; SKX: ## BB#0: +; SKX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4c,0xc1,0x20][2:2.00] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } @@ -160,10 +187,15 @@ ; SSE41-NEXT: phminposuw %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x38,0x41,0xc0] ; SSE41-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse41_phminposuw: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vphminposuw %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x41,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse41_phminposuw: +; AVX2: ## BB#0: +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x41,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse41_phminposuw: +; SKX: ## BB#0: +; SKX-NEXT: vphminposuw %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x41,0xc0][5:1.00] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } @@ -183,8 +215,8 @@ ; ; SKX-LABEL: test_x86_sse41_pmaxsb: ; SKX: ## BB#0: -; SKX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3c,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SKX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3c,0xc1][1:0.50] +; 
SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } @@ -204,8 +236,8 @@ ; ; SKX-LABEL: test_x86_sse41_pmaxsd: ; SKX: ## BB#0: -; SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3d,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3d,0xc1][1:0.50] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -225,8 +257,8 @@ ; ; SKX-LABEL: test_x86_sse41_pmaxud: ; SKX: ## BB#0: -; SKX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3f,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SKX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3f,0xc1][1:0.50] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -246,8 +278,8 @@ ; ; SKX-LABEL: test_x86_sse41_pmaxuw: ; SKX: ## BB#0: -; SKX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SKX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xc1][1:0.50] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } @@ -267,8 +299,8 @@ ; ; SKX-LABEL: test_x86_sse41_pminsb: ; SKX: ## BB#0: -; SKX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x38,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SKX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x38,0xc1][1:0.50] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } @@ -288,8 +320,8 @@ ; ; SKX-LABEL: test_x86_sse41_pminsd: ; SKX: ## BB#0: -; SKX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x39,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SKX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x39,0xc1][1:0.50] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -309,8 +341,8 @@ ; ; SKX-LABEL: test_x86_sse41_pminud: ; SKX: ## BB#0: -; SKX-NEXT: vpminud %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3b,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SKX-NEXT: vpminud %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3b,0xc1][1:0.50] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -330,8 +362,8 @@ ; ; SKX-LABEL: test_x86_sse41_pminuw: ; SKX: ## BB#0: -; SKX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3a,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SKX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3a,0xc1][1:0.50] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <8 x i16> 
@llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } @@ -351,8 +383,8 @@ ; ; SKX-LABEL: test_x86_sse41_pmuldq: ; SKX: ## BB#0: -; SKX-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x28,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SKX-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x28,0xc1][5:1.00] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } @@ -367,12 +399,19 @@ ; SSE41-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0] ; SSE41-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse41_ptestc: -; VCHECK: ## BB#0: -; VCHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; VCHECK-NEXT: vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1] -; VCHECK-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse41_ptestc: +; AVX2: ## BB#0: +; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX2-NEXT: vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1] +; AVX2-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse41_ptestc: +; SKX: ## BB#0: +; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; SKX-NEXT: vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1][2:1.00] +; SKX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0][1:0.50] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1) ; [#uses=1] ret i32 %res } @@ -387,12 +426,19 @@ ; SSE41-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] ; SSE41-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse41_ptestnzc: -; VCHECK: ## BB#0: -; VCHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; VCHECK-NEXT: vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1] -; VCHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse41_ptestnzc: +; AVX2: ## BB#0: +; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX2-NEXT: vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1] +; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse41_ptestnzc: +; SKX: ## BB#0: +; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0][1:0.25] +; SKX-NEXT: vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1][2:1.00] +; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0][1:0.50] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1) ; [#uses=1] ret i32 %res } @@ -407,12 +453,19 @@ ; SSE41-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] ; SSE41-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse41_ptestz: -; VCHECK: ## BB#0: -; VCHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; VCHECK-NEXT: vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1] -; VCHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse41_ptestz: +; AVX2: ## BB#0: +; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX2-NEXT: vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1] +; AVX2-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse41_ptestz: +; SKX: ## BB#0: +; SKX-NEXT: xorl %eax, %eax ## encoding: 
[0x31,0xc0][1:0.25] +; SKX-NEXT: vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1][2:1.00] +; SKX-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0][1:0.50] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1) ; [#uses=1] ret i32 %res } @@ -425,10 +478,15 @@ ; SSE41-NEXT: roundpd $7, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x09,0xc0,0x07] ; SSE41-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse41_round_pd: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vroundpd $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x09,0xc0,0x07] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse41_round_pd: +; AVX2: ## BB#0: +; AVX2-NEXT: vroundpd $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x09,0xc0,0x07] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse41_round_pd: +; SKX: ## BB#0: +; SKX-NEXT: vroundpd $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x09,0xc0,0x07][6:2.00] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1] ret <2 x double> %res } @@ -441,10 +499,15 @@ ; SSE41-NEXT: roundps $7, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x08,0xc0,0x07] ; SSE41-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse41_round_ps: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vroundps $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x08,0xc0,0x07] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse41_round_ps: +; AVX2: ## BB#0: +; AVX2-NEXT: vroundps $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x08,0xc0,0x07] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse41_round_ps: +; SKX: ## BB#0: +; SKX-NEXT: vroundps $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x08,0xc0,0x07][6:2.00] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -457,10 +520,15 @@ ; SSE41-NEXT: roundsd $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0b,0xc1,0x07] ; SSE41-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse41_round_sd: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0b,0xc1,0x07] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse41_round_sd: +; AVX2: ## BB#0: +; AVX2-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0b,0xc1,0x07] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse41_round_sd: +; SKX: ## BB#0: +; SKX-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0b,0xc1,0x07][6:2.00] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1] ret <2 x double> %res } @@ -474,11 +542,17 @@ ; SSE41-NEXT: roundsd $7, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0b,0x00,0x07] ; SSE41-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse41_round_sd_load: -; VCHECK: ## BB#0: -; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; VCHECK-NEXT: vroundsd $7, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0b,0x00,0x07] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse41_round_sd_load: +; AVX2: ## BB#0: +; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; AVX2-NEXT: vroundsd $7, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0b,0x00,0x07] +; AVX2-NEXT: retl ## 
encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse41_round_sd_load: +; SKX: ## BB#0: +; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04][4:0.50] +; SKX-NEXT: vroundsd $7, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0b,0x00,0x07][10:2.00] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %a1b = load <2 x double>, <2 x double>* %a1 %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1b, i32 7) ; <<2 x double>> [#uses=1] ret <2 x double> %res @@ -491,10 +565,15 @@ ; SSE41-NEXT: roundss $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0a,0xc1,0x07] ; SSE41-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse41_round_ss: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0a,0xc1,0x07] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse41_round_ss: +; AVX2: ## BB#0: +; AVX2-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0a,0xc1,0x07] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse41_round_ss: +; SKX: ## BB#0: +; SKX-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0a,0xc1,0x07][6:2.00] +; SKX-NEXT: retl ## encoding: [0xc3][1:1.00] %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } Index: test/CodeGen/X86/vec_fpext.ll =================================================================== --- test/CodeGen/X86/vec_fpext.ll +++ test/CodeGen/X86/vec_fpext.ll @@ -185,43 +185,37 @@ define <2 x double> @fpext_fromconst() { ; X32-SSE-LABEL: fpext_fromconst: ; X32-SSE: # BB#0: # %entry -; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00] -; X32-SSE-NEXT: # encoding: [0x0f,0x28,0x05,A,A,A,A] +; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]encoding: [0x0f,0x28,0x05,A,A,A,A] ; X32-SSE-NEXT: # fixup A - offset: 3, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X32-SSE-NEXT: retl # encoding: [0xc3] ; ; X32-AVX-LABEL: fpext_fromconst: ; X32-AVX: # BB#0: # %entry -; X32-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00] -; X32-AVX-NEXT: # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X32-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] ; X32-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X32-AVX-NEXT: retl # encoding: [0xc3] ; ; X32-AVX512VL-LABEL: fpext_fromconst: ; X32-AVX512VL: # BB#0: # %entry -; X32-AVX512VL-NEXT: vmovaps {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [1.000000e+00,-2.000000e+00] -; X32-AVX512VL-NEXT: # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X32-AVX512VL-NEXT: vmovaps {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [1.000000e+00,-2.000000e+00]encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] ; X32-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X32-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: fpext_fromconst: ; X64-SSE: # BB#0: # %entry -; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00] -; X64-SSE-NEXT: # encoding: [0x0f,0x28,0x05,A,A,A,A] +; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]encoding: [0x0f,0x28,0x05,A,A,A,A] ; X64-SSE-NEXT: # fixup A - offset: 3, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-SSE-NEXT: retq # encoding: [0xc3] ; ; X64-AVX-LABEL: fpext_fromconst: ; X64-AVX: # BB#0: # %entry -; X64-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00] -; X64-AVX-NEXT: # encoding: 
[0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: fpext_fromconst: ; X64-AVX512VL: # BB#0: # %entry -; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [1.000000e+00,-2.000000e+00] -; X64-AVX512VL-NEXT: # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [1.000000e+00,-2.000000e+00]encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] entry: Index: test/CodeGen/X86/vector-shuffle-128-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v16.ll +++ test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -12,33 +12,33 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; SSE2: # BB#0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: retq +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; SSSE3: # BB#0: -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-NEXT: pxor %xmm1, %xmm1 # [1:0.33] +; SSSE3-NEXT: pshufb %xmm1, %xmm0 # [1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pshufb %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE41-NEXT: pxor %xmm1, %xmm1 # [1:0.33] +; SSE41-NEXT: pshufb %xmm1, %xmm0 # [1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %xmm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %xmm0 # [1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -46,32 +46,32 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: ; SSE2: # BB#0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: retq +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} 
xmm0 = xmm0[0,0,1,1,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; SSSE3-NEXT: retq +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1][5:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: ; SSE41: # BB#0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; SSE41-NEXT: retq +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1][5:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX1OR2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: ; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX1OR2-NEXT: retq +; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1][5:0.50] +; AVX1OR2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -79,34 +79,34 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: retq +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] -; SSSE3-NEXT: retq +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8][5:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; SSE41: # BB#0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] -; SSE41-NEXT: retq +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8][5:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX1OR2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] -; AVX1OR2-NEXT: retq +; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8][5:0.50] +; AVX1OR2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11][5:0.50] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -114,15 +114,15 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) { ; SSE-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: ; SSE: # BB#0: -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE-NEXT: retq +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3][1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: ; AVX: # BB#0: -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; AVX-NEXT: retq +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3][1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -130,15 +130,15 @@ define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) { ; SSE-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: ; SSE: # BB#0: -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: retq +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7][1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: ; AVX: # BB#0: -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; AVX-NEXT: retq +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7][1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -146,28 +146,28 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: ; SSE2: # BB#0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6] -; SSE2-NEXT: retq +; 
SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] -; SSSE3-NEXT: retq +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12][5:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: ; SSE41: # BB#0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] -; SSE41-NEXT: retq +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12][5:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: ; AVX: # BB#0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] -; AVX-NEXT: retq +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12][5:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -175,13 +175,13 @@ define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) { ; SSE-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07: ; SSE: # BB#0: -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: retq +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07: ; AVX: # BB#0: -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX-NEXT: retq +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -189,20 +189,20 @@ define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) { ; SSE-LABEL: shuffle_v16i8_0101010101010101: ; SSE: # BB#0: -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: retq +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX1-LABEL: shuffle_v16i8_0101010101010101: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i8_0101010101010101: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %xmm0 # [1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -210,13 +210,13 @@ define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> 
%b) { ; SSE-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23: ; SSE: # BB#0: -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: retq +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23: ; AVX: # BB#0: -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: retq +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -224,13 +224,13 @@ define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(<16 x i8> %a, <16 x i8> %b) { ; SSE-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31: ; SSE: # BB#0: -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: retq +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15][1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31: ; AVX: # BB#0: -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX-NEXT: retq +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15][1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -238,42 +238,42 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: ; SSE2: # BB#0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: pxor %xmm2, %xmm2 # [1:0.33] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 # [5:0.50] +; SSE2-NEXT: por %xmm2, %xmm0 # [1:0.33] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: ; SSSE3: # BB#0: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7][1:0.50] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7][1:0.50] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 # [1:0.33] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: ; SSE41: # BB#0: -; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7][1:0.50] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7][1:0.50] +; SSE41-NEXT: movdqa %xmm1, %xmm0 # [1:0.33] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: ; AVX1: # BB#0: -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: retq +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7][1:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1 # [1:0.50] +; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -281,31 +281,31 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12: ; SSE2: # BB#0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: pxor %xmm1, %xmm1 # [1:0.33] +; SSE2-NEXT: movdqa %xmm0, %xmm2 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4][1:0.50] +; SSE2-NEXT: packuswb %xmm2, %xmm0 # [1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12: ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; SSSE3-NEXT: retq +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12][5:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12: ; SSE41: # BB#0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; SSE41-NEXT: retq +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12][5:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12: ; AVX: # BB#0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; AVX-NEXT: retq +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12][5:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -313,33 +313,33 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: ; SSE2: # BB#0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: pxor %xmm2, %xmm2 # [1:0.33] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7][1:0.50] +; 
SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4][1:0.50] +; SSE2-NEXT: packuswb %xmm1, %xmm0 # [1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: ; SSSE3: # BB#0: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9] -; SSSE3-NEXT: retq +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9][5:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: ; SSE41: # BB#0: -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9] -; SSE41-NEXT: retq +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9][5:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: ; AVX: # BB#0: -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9] -; AVX-NEXT: retq +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9][5:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -347,44 +347,44 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[3,2,1,0,4,5,6,7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} 
xmm1 = xmm1[3,2,1,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3][1:0.50] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1][1:0.50] +; SSE2-NEXT: pxor %xmm1, %xmm1 # [1:0.33] +; SSE2-NEXT: movdqa %xmm0, %xmm2 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[3,2,1,0,4,5,6,7][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7][1:0.50] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7][1:0.50] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0][1:0.50] +; SSE2-NEXT: packuswb %xmm3, %xmm0 # [1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: retq +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u][5:0.50] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u][5:0.50] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1][1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: ; SSE41: # BB#0: -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u] -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u] -; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE41-NEXT: retq +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u][5:0.50] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u][5:0.50] +; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1][1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: ; AVX: # BB#0: -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: retq +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u][5:0.50] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u][5:0.50] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1][1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> 
%b, <16 x i32> ret <16 x i8> %shuffle } @@ -392,39 +392,40 @@ define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; SSE2: # BB#0: -; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: andps %xmm2, %xmm0 -; SSE2-NEXT: andnps %xmm1, %xmm2 -; SSE2-NEXT: orps %xmm2, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0][4:0.50] +; SSE2-NEXT: andps %xmm2, %xmm0 # [1:0.33] +; SSE2-NEXT: andnps %xmm1, %xmm2 # [1:0.33] +; SSE2-NEXT: orps %xmm2, %xmm0 # [1:0.33] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: retq +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u][5:0.50] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u][5:0.50] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE41-NEXT: movdqa %xmm0, %xmm2 # [1:0.33] +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0][4:0.50] +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 # [2:1.00] +; SSE41-NEXT: movdqa %xmm1, %xmm0 # [1:0.33] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX1OR2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0][4:0.50] +; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 # [2:1.00] +; AVX1OR2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movw $-21846, %ax # imm = 0xAAAA +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -432,39 +433,40 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: ; SSE2: # BB#0: -; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] -; SSE2-NEXT: andps %xmm2, %xmm0 -; SSE2-NEXT: andnps %xmm1, %xmm2 -; SSE2-NEXT: orps %xmm2, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: movaps {{.*#+}} xmm2 = 
[255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0][4:0.50] +; SSE2-NEXT: andps %xmm2, %xmm0 # [1:0.33] +; SSE2-NEXT: andnps %xmm1, %xmm2 # [1:0.33] +; SSE2-NEXT: orps %xmm2, %xmm0 # [1:0.33] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[15] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6],zero,xmm0[8,9,10],zero,xmm0[12,13,14],zero -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[15][5:0.50] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6],zero,xmm0[8,9,10],zero,xmm0[12,13,14],zero[5:0.50] +; SSSE3-NEXT: por %xmm1, %xmm0 # [1:0.33] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE41-NEXT: movdqa %xmm0, %xmm2 # [1:0.33] +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0][4:0.50] +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 # [2:1.00] +; SSE41-NEXT: movdqa %xmm1, %xmm0 # [1:0.33] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: ; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] -; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0][4:0.50] +; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 # [2:1.00] +; AVX1OR2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movw $-30584, %ax # imm = 0x8888 +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -472,18 +474,18 @@ define <16 x i8> @shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz(<16 x i8> %a) { ; SSE-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: ; SSE: # BB#0: -; SSE-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 # [5:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: ; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1OR2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX1OR2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 # [5:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -491,39 +493,40 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(<16 x i8> %a, 
<16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: ; SSE2: # BB#0: -; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] -; SSE2-NEXT: andps %xmm2, %xmm0 -; SSE2-NEXT: andnps %xmm1, %xmm2 -; SSE2-NEXT: orps %xmm2, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0][4:0.50] +; SSE2-NEXT: andps %xmm2, %xmm0 # [1:0.33] +; SSE2-NEXT: andnps %xmm1, %xmm2 # [1:0.33] +; SSE2-NEXT: orps %xmm2, %xmm0 # [1:0.33] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,xmm0[5,6],zero,xmm0[8,9,10,11],zero,xmm0[13,14],zero -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15][5:0.50] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,xmm0[5,6],zero,xmm0[8,9,10,11],zero,xmm0[13,14],zero[5:0.50] +; SSSE3-NEXT: por %xmm1, %xmm0 # [1:0.33] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE41-NEXT: movdqa %xmm0, %xmm2 # [1:0.33] +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0][4:0.50] +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 # [2:1.00] +; SSE41-NEXT: movdqa %xmm1, %xmm0 # [1:0.33] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: ; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] -; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0][4:0.50] +; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 # [2:1.00] +; AVX1OR2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movw $-28528, %ax # imm = 0x9090 +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -531,40 +534,41 @@ define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15: ; SSE2: # BB#0: -; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0] -; SSE2-NEXT: andps %xmm2, %xmm1 -; SSE2-NEXT: andnps %xmm0, %xmm2 -; SSE2-NEXT: orps %xmm1, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0][4:0.50] +; SSE2-NEXT: andps %xmm2, %xmm1 # [1:0.33] +; SSE2-NEXT: andnps %xmm0, %xmm2 # [1:0.33] +; SSE2-NEXT: orps %xmm1, %xmm2 # [1:0.33] +; SSE2-NEXT: movaps %xmm2, %xmm0 # [1:1.00] +; 
SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15: ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[4,5,6,7],zero,zero,xmm0[10,11],zero,xmm0[13],zero,xmm0[15] -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[8,9],zero,zero,xmm1[12],zero,xmm1[14],zero -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[4,5,6,7],zero,zero,xmm0[10,11],zero,xmm0[13],zero,xmm0[15][5:0.50] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[8,9],zero,zero,xmm1[12],zero,xmm1[14],zero[5:0.50] +; SSSE3-NEXT: por %xmm1, %xmm0 # [1:0.33] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0] -; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE41-NEXT: movdqa %xmm0, %xmm2 # [1:0.33] +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0][4:0.50] +; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 # [2:1.00] +; SSE41-NEXT: movdqa %xmm2, %xmm0 # [1:0.33] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX1OR2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15: ; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0] -; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0][4:0.50] +; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # [2:1.00] +; AVX1OR2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movw $-21264, %ax # imm = 0xACF0 +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1} -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -572,25 +576,25 @@ define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) { ; SSE2-LABEL: trunc_v4i32_shuffle: ; SSE2: # BB#0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 # [5:0.50] +; SSE2-NEXT: packuswb %xmm0, %xmm0 # [1:0.50] +; SSE2-NEXT: packuswb %xmm0, %xmm0 # [1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: trunc_v4i32_shuffle: ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: retq +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u][5:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: trunc_v4i32_shuffle: ; SSE41: # BB#0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; SSE41-NEXT: retq +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u][5:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: trunc_v4i32_shuffle: ; AVX: # BB#0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u][5:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> ret <16 x i8> %shuffle } @@ -599,7 
+603,7 @@ ; We don't have anything useful to check here. This generates 100s of ; instructions. Instead, just make sure we survived codegen. ; ALL-LABEL: stress_test0: -; ALL: retq +; ALL: retq # [5:1.00] entry: %s.1.4 = shufflevector <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i32> %s.1.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i32> @@ -621,7 +625,7 @@ ; ; ALL-LABEL: undef_test1: ; ALL: # BB#0: # %entry -; ALL-NEXT: retq +; ALL-NEXT: retq # [5:1.00] entry: %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> undef, <16 x i32> %s.2.4 = shufflevector <16 x i8> undef, <16 x i8> %s.0.5, <16 x i32> @@ -643,25 +647,25 @@ define <16 x i8> @PR20540(<8 x i8> %a) { ; SSE2-LABEL: PR20540: ; SSE2: # BB#0: -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero -; SSE2-NEXT: retq +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 # [5:0.50] +; SSE2-NEXT: packuswb %xmm0, %xmm0 # [1:0.50] +; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero[1:0.33] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: PR20540: ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero[5:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: PR20540: ; SSE41: # BB#0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero[5:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: PR20540: ; AVX: # BB#0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: retq +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero[5:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -669,15 +673,15 @@ define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { ; SSE-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE: # BB#0: -; SSE-NEXT: movzbl %dil, %eax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: movzbl %dil, %eax # [1:0.33] +; SSE-NEXT: movd %eax, %xmm0 # [1:0.33] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: movzbl %dil, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: movzbl %dil, %eax # [1:0.33] +; AVX-NEXT: vmovd %eax, %xmm0 # [1:0.33] +; AVX-NEXT: retq # [5:1.00] %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle @@ -686,29 +690,29 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { ; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE2: # BB#0: -; SSE2-NEXT: shll $8, %edi -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pinsrw $2, %edi, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: shll $8, %edi # [1:0.50] +; SSE2-NEXT: pxor %xmm0, %xmm0 # [1:0.33] +; SSE2-NEXT: pinsrw $2, %edi, %xmm0 # [1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSSE3: # BB#0: -; SSSE3-NEXT: shll $8, %edi -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pinsrw $2, 
%edi, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-NEXT: shll $8, %edi # [1:0.50] +; SSSE3-NEXT: pxor %xmm0, %xmm0 # [1:0.33] +; SSSE3-NEXT: pinsrw $2, %edi, %xmm0 # [1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pinsrb $5, %edi, %xmm0 -; SSE41-NEXT: retq +; SSE41-NEXT: pxor %xmm0, %xmm0 # [1:0.33] +; SSE41-NEXT: pinsrb $5, %edi, %xmm0 # [1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 # [1:0.33] +; AVX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 # [1:0.50] +; AVX-NEXT: retq # [5:1.00] %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle @@ -717,29 +721,29 @@ define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) { ; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: ; SSE2: # BB#0: -; SSE2-NEXT: shll $8, %edi -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pinsrw $7, %edi, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: shll $8, %edi # [1:0.50] +; SSE2-NEXT: pxor %xmm0, %xmm0 # [1:0.33] +; SSE2-NEXT: pinsrw $7, %edi, %xmm0 # [1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: ; SSSE3: # BB#0: -; SSSE3-NEXT: shll $8, %edi -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pinsrw $7, %edi, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-NEXT: shll $8, %edi # [1:0.50] +; SSSE3-NEXT: pxor %xmm0, %xmm0 # [1:0.33] +; SSSE3-NEXT: pinsrw $7, %edi, %xmm0 # [1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pinsrb $15, %edi, %xmm0 -; SSE41-NEXT: retq +; SSE41-NEXT: pxor %xmm0, %xmm0 # [1:0.33] +; SSE41-NEXT: pinsrb $15, %edi, %xmm0 # [1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 # [1:0.33] +; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 # [1:0.50] +; AVX-NEXT: retq # [5:1.00] %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle @@ -748,29 +752,29 @@ define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { ; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE2: # BB#0: -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pinsrw $1, %eax, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: movzbl %dil, %eax # [1:0.33] +; SSE2-NEXT: pxor %xmm0, %xmm0 # [1:0.33] +; SSE2-NEXT: pinsrw $1, %eax, %xmm0 # [1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSSE3: # BB#0: -; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pinsrw $1, %eax, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-NEXT: movzbl %dil, %eax # [1:0.33] +; SSSE3-NEXT: pxor %xmm0, %xmm0 # [1:0.33] +; SSSE3-NEXT: pinsrw $1, %eax, %xmm0 # [1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: 
shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pinsrb $2, %edi, %xmm0 -; SSE41-NEXT: retq +; SSE41-NEXT: pxor %xmm0, %xmm0 # [1:0.33] +; SSE41-NEXT: pinsrb $2, %edi, %xmm0 # [1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 # [1:0.33] +; AVX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 # [1:0.50] +; AVX-NEXT: retq # [5:1.00] %a = insertelement <16 x i8> undef, i8 %i, i32 3 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle @@ -779,13 +783,13 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) { ; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu: ; SSE: # BB#0: -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] -; SSE-NEXT: retq +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3][1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu: ; AVX: # BB#0: -; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] -; AVX-NEXT: retq +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3][1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle } @@ -793,13 +797,13 @@ define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) { ; SSE-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE: # BB#0: -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: retq +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero[1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: retq +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero[1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle } @@ -807,25 +811,25 @@ define <16 x i8> @shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: ; SSE2: # BB#0: -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero[1:0.50] +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; SSE2-NEXT: por %xmm1, %xmm0 # [1:0.33] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: ; 
SSSE3: # BB#0: -; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; SSSE3-NEXT: retq +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: ; SSE41: # BB#0: -; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; SSE41-NEXT: retq +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: ; AVX: # BB#0: -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; AVX-NEXT: retq +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -833,26 +837,26 @@ define <16 x i8> @shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa %xmm0, %xmm1 # [1:0.33] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero[1:0.50] +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; SSE2-NEXT: por %xmm1, %xmm0 # [1:0.33] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: ; SSSE3: # BB#0: -; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; SSSE3-NEXT: retq +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: ; SSE41: # BB#0: -; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; SSE41-NEXT: retq +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: ; AVX: # BB#0: -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; AVX-NEXT: retq +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -860,25 +864,25 @@ define <16 x i8> @shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: ; SSE2: # BB#0: -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero[1:0.50] +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0][1:0.50] +; SSE2-NEXT: por %xmm1, %xmm0 # [1:0.33] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: ; SSSE3: # BB#0: -; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; SSSE3-NEXT: retq +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0][1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: ; SSE41: # BB#0: -; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; SSE41-NEXT: retq +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0][1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: ; AVX: # BB#0: -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; AVX-NEXT: retq +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0][1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -886,27 +890,27 @@ define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: ; SSE2: # BB#0: -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero[1:0.50] +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0][1:0.50] +; SSE2-NEXT: por %xmm1, %xmm0 # [1:0.33] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: ; SSSE3: # BB#0: -; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0][1:0.50] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 # [1:0.33] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: ; SSE41: # BB#0: -; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0][1:0.50] +; SSE41-NEXT: movdqa %xmm1, %xmm0 # [1:0.33] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: ; AVX: # BB#0: -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX-NEXT: retq +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0][1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -914,26 +918,26 @@ define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = 
xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa %xmm0, %xmm1 # [1:0.33] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero[1:0.50] +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0][1:0.50] +; SSE2-NEXT: por %xmm1, %xmm0 # [1:0.33] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: ; SSSE3: # BB#0: -; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0] -; SSSE3-NEXT: retq +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0][1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: ; SSE41: # BB#0: -; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0] -; SSE41-NEXT: retq +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0][1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: ; AVX: # BB#0: -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0] -; AVX-NEXT: retq +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0][1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -941,27 +945,27 @@ define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; SSE2: # BB#0: -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero[1:0.50] +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; SSE2-NEXT: por %xmm1, %xmm0 # [1:0.33] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; SSSE3: # BB#0: -; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 # [1:0.33] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; SSE41: # BB#0: -; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; SSE41-NEXT: movdqa %xmm1, %xmm0 # [1:0.33] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; AVX: # BB#0: -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; AVX-NEXT: retq +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x 
i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -970,34 +974,34 @@ define <16 x i8> @shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23(<16 x i8> %val1, <16 x i8> %val2) { ; SSE2-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23: ; SSE2: # BB#0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,1,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,1,3][1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0][4:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3][1:0.50] +; SSE2-NEXT: pand %xmm1, %xmm0 # [1:0.33] +; SSE2-NEXT: pandn %xmm2, %xmm1 # [1:0.33] +; SSE2-NEXT: por %xmm0, %xmm1 # [1:0.33] +; SSE2-NEXT: movdqa %xmm1, %xmm0 # [1:0.33] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23: ; SSSE3: # BB#0: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSSE3-NEXT: retq +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3][1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23: ; SSE41: # BB#0: -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE41-NEXT: retq +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3][1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23: ; AVX: # BB#0: -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX-NEXT: retq +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3][1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %val1, <16 x i8> %val2, <16 x i32> ret <16 x i8> %shuffle } @@ -1005,25 +1009,25 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a) { ; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: ; SSE2: # BB#0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: retq +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero[5:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: ; SSE41: # BB#0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero[1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: retq +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero[1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -1031,26 +1035,26 @@ define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) { ; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: ; SSE2: # BB#0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq +; SSE2-NEXT: pxor %xmm1, %xmm1 # [1:0.33] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero[5:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: ; SSE41: # BB#0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero[1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: retq +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero[1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -1058,25 +1062,25 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) { ; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: ; SSE2: # BB#0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: retq +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: ; SSSE3: # BB#0: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: retq +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3][1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: ; SSE41: # BB#0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: retq +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero[1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX-NEXT: retq +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero[1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -1084,27 +1088,27 @@ define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) { ; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: ; SSE2: # BB#0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: retq +; SSE2-NEXT: pxor %xmm1, %xmm1 # [1:0.33] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: ; SSSE3: # 
BB#0: -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: retq +; SSSE3-NEXT: pxor %xmm1, %xmm1 # [1:0.33] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: ; SSE41: # BB#0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: retq +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero[1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX-NEXT: retq +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero[1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -1112,23 +1116,23 @@ define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(<16 x i8> %a) { ; SSE2-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: ; SSE2: # BB#0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: retq +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: ; SSSE3: # BB#0: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: retq +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: ; SSE41: # BB#0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: retq +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero[1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: retq +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero[1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -1136,25 +1140,25 @@ define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(<16 x i8> %a) { ; SSE2-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: ; SSE2: 
# BB#0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: retq +; SSE2-NEXT: pxor %xmm1, %xmm1 # [1:0.33] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: ; SSSE3: # BB#0: -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: retq +; SSSE3-NEXT: pxor %xmm1, %xmm1 # [1:0.33] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: ; SSE41: # BB#0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: retq +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero[1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: retq +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero[1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -1162,56 +1166,56 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: psrlq $16, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,1,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,4] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE2-NEXT: pandn %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: pxor %xmm2, %xmm2 # [1:0.33] +; SSE2-NEXT: movdqa %xmm0, %xmm3 # [1:0.33] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,0,1][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7][1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535][4:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4][1:0.50] +; SSE2-NEXT: pand %xmm5, %xmm2 # [1:0.33] +; SSE2-NEXT: pandn %xmm4, %xmm5 # [1:0.33] +; SSE2-NEXT: por %xmm2, %xmm5 # [1:0.33] +; SSE2-NEXT: psrlq $16, %xmm3 # [1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,1,3][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,4][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3][1:0.50] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1][1:0.50] +; SSE2-NEXT: packuswb %xmm5, %xmm2 # [1:0.50] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255][4:0.50] +; SSE2-NEXT: pand %xmm0, %xmm2 # [1:0.33] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,1,1,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1][1:0.50] +; SSE2-NEXT: pandn %xmm1, %xmm0 # [1:0.33] +; SSE2-NEXT: por %xmm2, %xmm0 # [1:0.33] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero[5:0.50] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0][5:0.50] +; SSSE3-NEXT: por %xmm1, %xmm0 # [1:0.33] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = 
xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero[5:0.50] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0][5:0.50] +; SSE41-NEXT: por %xmm1, %xmm0 # [1:0.33] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero[5:0.50] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0][5:0.50] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 # [1:0.33] +; AVX-NEXT: retq # [5:1.00] entry: %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -1221,7 +1225,7 @@ define <16 x i8> @stress_test2(<16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i8> %s.0.2) { ; Nothing interesting to test here. Just make sure we didn't crashe. ; ALL-LABEL: stress_test2: -; ALL: retq +; ALL: retq # [5:1.00] entry: %s.1.0 = shufflevector <16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i32> %s.1.1 = shufflevector <16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i32> @@ -1233,24 +1237,24 @@ define void @constant_gets_selected(<4 x i32>* %ptr1, <4 x i32>* %ptr2) { ; SSE-LABEL: constant_gets_selected: ; SSE: # BB#0: # %entry -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, (%rdi) -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: retq +; SSE-NEXT: xorps %xmm0, %xmm0 # [1:0.33] +; SSE-NEXT: movaps %xmm0, (%rdi) # [1:1.00] +; SSE-NEXT: movaps %xmm0, (%rsi) # [1:1.00] +; SSE-NEXT: retq # [5:1.00] ; ; AVX1OR2-LABEL: constant_gets_selected: ; AVX1OR2: # BB#0: # %entry -; AVX1OR2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vmovaps %xmm0, (%rdi) -; AVX1OR2-NEXT: vmovaps %xmm0, (%rsi) -; AVX1OR2-NEXT: retq +; AVX1OR2-NEXT: vxorps %xmm0, %xmm0, %xmm0 # [1:0.33] +; AVX1OR2-NEXT: vmovaps %xmm0, (%rdi) # [1:1.00] +; AVX1OR2-NEXT: vmovaps %xmm0, (%rsi) # [1:1.00] +; AVX1OR2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: constant_gets_selected: ; AVX512VL: # BB#0: # %entry -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa %xmm0, (%rdi) -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 # [1:0.33] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rdi) # [1:1.00] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) # [1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] entry: %weird_zero = bitcast <4 x i32> zeroinitializer to <16 x i8> %shuffle.i = shufflevector <16 x i8> , <16 x i8> %weird_zero, <16 x i32> @@ -1267,13 +1271,13 @@ define <16 x i8> @shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i8> %a, <16 x i8> %b) { ; SSE-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14: ; SSE: # BB#0: -; SSE-NEXT: psllw $8, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: psllw $8, %xmm0 # [1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14: ; AVX: # BB#0: -; AVX-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: vpsllw $8, %xmm0, %xmm0 # [1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -1281,13 +1285,13 
@@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i8> %a, <16 x i8> %b) { ; SSE-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12: ; SSE: # BB#0: -; SSE-NEXT: pslld $24, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: pslld $24, %xmm0 # [1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12: ; AVX: # BB#0: -; AVX-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: vpslld $24, %xmm0, %xmm0 # [1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -1295,13 +1299,13 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08(<16 x i8> %a, <16 x i8> %b) { ; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08: ; SSE: # BB#0: -; SSE-NEXT: psllq $56, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: psllq $56, %xmm0 # [1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08: ; AVX: # BB#0: -; AVX-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: vpsllq $56, %xmm0, %xmm0 # [1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -1309,13 +1313,13 @@ define <16 x i8> @shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14(<16 x i8> %a, <16 x i8> %b) { ; SSE-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14: ; SSE: # BB#0: -; SSE-NEXT: psllq $8, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: psllq $8, %xmm0 # [1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14: ; AVX: # BB#0: -; AVX-NEXT: vpsllq $8, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: vpsllq $8, %xmm0, %xmm0 # [1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -1323,13 +1327,13 @@ define <16 x i8> @shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz(<16 x i8> %a, <16 x i8> %b) { ; SSE-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz: ; SSE: # BB#0: -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: psrlw $8, %xmm0 # [1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz: ; AVX: # BB#0: -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 # [1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -1337,13 +1341,13 @@ define <16 x i8> @shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz(<16 x i8> %a, <16 x i8> %b) { ; SSE-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz: ; SSE: # BB#0: -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: psrld $16, %xmm0 # [1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 # [1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -1351,13 +1355,13 @@ define <16 x i8> @shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz(<16 x i8> %a, <16 x i8> %b) { ; SSE-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz: ; SSE: # 
BB#0: -; SSE-NEXT: psrlq $56, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: psrlq $56, %xmm0 # [1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: vpsrlq $56, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: vpsrlq $56, %xmm0, %xmm0 # [1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -1365,43 +1369,43 @@ define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) { ; SSE2-LABEL: PR12412: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255][4:0.50] +; SSE2-NEXT: pand %xmm2, %xmm1 # [1:0.33] +; SSE2-NEXT: pand %xmm2, %xmm0 # [1:0.33] +; SSE2-NEXT: packuswb %xmm1, %xmm0 # [1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: PR12412: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>[4:0.50] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 # [1:0.50] +; SSSE3-NEXT: pshufb %xmm2, %xmm0 # [1:0.50] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0][1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: PR12412: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; SSE41-NEXT: pshufb %xmm2, %xmm1 -; SSE41-NEXT: pshufb %xmm2, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: retq +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>[4:0.50] +; SSE41-NEXT: pshufb %xmm2, %xmm1 # [1:0.50] +; SSE41-NEXT: pshufb %xmm2, %xmm0 # [1:0.50] +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0][1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX1OR2-LABEL: PR12412: ; AVX1OR2: # BB#0: # %entry -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1OR2-NEXT: retq +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>[4:0.50] +; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1OR2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0][1:0.50] +; AVX1OR2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: PR12412: ; AVX512VL: # BB#0: # %entry -; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>[4:0.50] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0][1:0.50] +; AVX512VL-NEXT: retq # [5:1.00] entry: %0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> ret <16 x i8> %0 @@ -1410,13 +1414,13 @@ define <16 x i8> @shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz(<16 x i8> %a) { 
; SSE-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz: ; SSE: # BB#0: -; SSE-NEXT: psrld $8, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: psrld $8, %xmm0 # [1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz: ; AVX: # BB#0: -; AVX-NEXT: vpsrld $8, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: vpsrld $8, %xmm0, %xmm0 # [1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -1424,13 +1428,13 @@ define <16 x i8> @shuffle_v16i8_bitcast_unpack(<16 x i8> %a, <16 x i8> %b) { ; SSE-LABEL: shuffle_v16i8_bitcast_unpack: ; SSE: # BB#0: -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: retq +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; SSE-NEXT: retq # [5:1.00] ; ; AVX-LABEL: shuffle_v16i8_bitcast_unpack: ; AVX: # BB#0: -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: retq +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX-NEXT: retq # [5:1.00] %shuffle8 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> %bitcast32 = bitcast <16 x i8> %shuffle8 to <4 x float> %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> @@ -1443,37 +1447,37 @@ define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) { ; SSE2-LABEL: insert_dup_mem_v16i8_i32: ; SSE2: # BB#0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: retq +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: insert_dup_mem_v16i8_i32: ; SSSE3: # BB#0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; SSSE3-NEXT: pxor %xmm1, %xmm1 # [1:0.33] +; SSSE3-NEXT: pshufb %xmm1, %xmm0 # [1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: insert_dup_mem_v16i8_i32: ; SSE41: # BB#0: -; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pshufb %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; SSE41-NEXT: pxor %xmm1, %xmm1 # [1:0.33] +; SSE41-NEXT: pshufb %xmm1, %xmm0 # [1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX1-LABEL: insert_dup_mem_v16i8_i32: ; AVX1: # BB#0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # 
[1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: insert_dup_mem_v16i8_i32: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0 # [4:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8> @@ -1484,41 +1488,41 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) { ; SSE2-LABEL: insert_dup_mem_v16i8_sext_i8: ; SSE2: # BB#0: -; SSE2-NEXT: movsbl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: retq +; SSE2-NEXT: movsbl (%rdi), %eax # [5:0.50] +; SSE2-NEXT: movd %eax, %xmm0 # [1:0.33] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8: ; SSSE3: # BB#0: -; SSSE3-NEXT: movsbl (%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-NEXT: movsbl (%rdi), %eax # [5:0.50] +; SSSE3-NEXT: movd %eax, %xmm0 # [1:0.33] +; SSSE3-NEXT: pxor %xmm1, %xmm1 # [1:0.33] +; SSSE3-NEXT: pshufb %xmm1, %xmm0 # [1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: insert_dup_mem_v16i8_sext_i8: ; SSE41: # BB#0: -; SSE41-NEXT: movsbl (%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pshufb %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE41-NEXT: movsbl (%rdi), %eax # [5:0.50] +; SSE41-NEXT: movd %eax, %xmm0 # [1:0.33] +; SSE41-NEXT: pxor %xmm1, %xmm1 # [1:0.33] +; SSE41-NEXT: pshufb %xmm1, %xmm0 # [1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX1-LABEL: insert_dup_mem_v16i8_sext_i8: ; AVX1: # BB#0: -; AVX1-NEXT: movsbl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq +; AVX1-NEXT: movsbl (%rdi), %eax # [5:0.50] +; AVX1-NEXT: vmovd %eax, %xmm0 # [1:0.33] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: insert_dup_mem_v16i8_sext_i8: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0 # [4:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %tmp = load i8, i8* %ptr, align 1 %tmp1 = sext i8 %tmp to i32 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 @@ -1530,34 +1534,34 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) { ; SSE2-LABEL: insert_dup_elt1_mem_v16i8_i32: ; SSE2: # BB#0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: retq +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7][1:0.50] +; SSE2-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32: ; SSSE3: # BB#0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSSE3-NEXT: retq +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1][5:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: insert_dup_elt1_mem_v16i8_i32: ; SSE41: # BB#0: -; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: retq +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1][5:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX1-LABEL: insert_dup_elt1_mem_v16i8_i32: ; AVX1: # BB#0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: retq +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1][5:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v16i8_i32: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpbroadcastb 1(%rdi), %xmm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastb 1(%rdi), %xmm0 # [4:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8> @@ -1568,34 +1572,34 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) { ; SSE2-LABEL: insert_dup_elt2_mem_v16i8_i32: ; SSE2: # BB#0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: retq +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32: ; SSSE3: # BB#0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; SSSE3-NEXT: retq +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2][5:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: insert_dup_elt2_mem_v16i8_i32: ; SSE41: # BB#0: -; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; SSE41-NEXT: retq +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2][5:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX1-LABEL: insert_dup_elt2_mem_v16i8_i32: ; AVX1: # BB#0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; AVX1-NEXT: retq +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2][5:0.50] +; 
AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: insert_dup_elt2_mem_v16i8_i32: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpbroadcastb 2(%rdi), %xmm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastb 2(%rdi), %xmm0 # [4:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8> @@ -1606,48 +1610,48 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) { ; SSE2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: ; SSE2: # BB#0: -; SSE2-NEXT: movsbl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: retq +; SSE2-NEXT: movsbl (%rdi), %eax # [5:0.50] +; SSE2-NEXT: movd %eax, %xmm0 # [1:0.33] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: ; SSSE3: # BB#0: -; SSSE3-NEXT: movsbl (%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSSE3-NEXT: retq +; SSSE3-NEXT: movsbl (%rdi), %eax # [5:0.50] +; SSSE3-NEXT: movd %eax, %xmm0 # [1:0.33] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1][5:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: ; SSE41: # BB#0: -; SSE41-NEXT: movsbl (%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: retq +; SSE41-NEXT: movsbl (%rdi), %eax # [5:0.50] +; SSE41-NEXT: movd %eax, %xmm0 # [1:0.33] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1][5:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX1-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: ; AVX1: # BB#0: -; AVX1-NEXT: movsbl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: retq +; AVX1-NEXT: movsbl (%rdi), %eax # [5:0.50] +; AVX1-NEXT: vmovd %eax, %xmm0 # [1:0.33] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1][5:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: ; AVX2: # BB#0: -; AVX2-NEXT: movsbl (%rdi), %eax -; AVX2-NEXT: shrl $8, %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX2-NEXT: movsbl (%rdi), %eax # [5:0.50] +; AVX2-NEXT: shrl $8, %eax # [1:0.50] +; AVX2-NEXT: vmovd %eax, %xmm0 # [1:0.33] +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: movsbl (%rdi), %eax -; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: movsbl (%rdi), %eax # [5:0.50] +; AVX512VL-NEXT: shrl $8, %eax # [1:0.50] ; AVX512VL-NEXT: vpbroadcastb %al, %xmm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %tmp = load i8, i8* %ptr, align 1 %tmp1 = sext i8 %tmp to i32 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 @@ -1659,48 +1663,48 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) { ; SSE2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: ; SSE2: # BB#0: -; SSE2-NEXT: movsbl (%rdi), 
%eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: retq +; SSE2-NEXT: movsbl (%rdi), %eax # [5:0.50] +; SSE2-NEXT: movd %eax, %xmm0 # [1:0.33] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: ; SSSE3: # BB#0: -; SSSE3-NEXT: movsbl (%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; SSSE3-NEXT: retq +; SSSE3-NEXT: movsbl (%rdi), %eax # [5:0.50] +; SSSE3-NEXT: movd %eax, %xmm0 # [1:0.33] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2][5:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: ; SSE41: # BB#0: -; SSE41-NEXT: movsbl (%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; SSE41-NEXT: retq +; SSE41-NEXT: movsbl (%rdi), %eax # [5:0.50] +; SSE41-NEXT: movd %eax, %xmm0 # [1:0.33] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2][5:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX1-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: ; AVX1: # BB#0: -; AVX1-NEXT: movsbl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; AVX1-NEXT: retq +; AVX1-NEXT: movsbl (%rdi), %eax # [5:0.50] +; AVX1-NEXT: vmovd %eax, %xmm0 # [1:0.33] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2][5:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: ; AVX2: # BB#0: -; AVX2-NEXT: movsbl (%rdi), %eax -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX2-NEXT: movsbl (%rdi), %eax # [5:0.50] +; AVX2-NEXT: shrl $16, %eax # [1:0.50] +; AVX2-NEXT: vmovd %eax, %xmm0 # [1:0.33] +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: movsbl (%rdi), %eax -; AVX512VL-NEXT: shrl $16, %eax +; AVX512VL-NEXT: movsbl (%rdi), %eax # [5:0.50] +; AVX512VL-NEXT: shrl $16, %eax # [1:0.50] ; AVX512VL-NEXT: vpbroadcastb %al, %xmm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %tmp = load i8, i8* %ptr, align 1 %tmp1 = sext i8 %tmp to i32 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 @@ -1712,49 +1716,49 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b) { ; SSE2-LABEL: PR31364: ; SSE2: # BB#0: -; SSE2-NEXT: movzbl (%rdi), %eax -; SSE2-NEXT: movzbl (%rsi), %ecx -; SSE2-NEXT: shll $8, %ecx -; SSE2-NEXT: orl %eax, %ecx -; SSE2-NEXT: movzwl %cx, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,4,4] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE2-NEXT: pshuflw 
{{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,7] -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: movzbl (%rdi), %eax # [5:0.50] +; SSE2-NEXT: movzbl (%rsi), %ecx # [5:0.50] +; SSE2-NEXT: shll $8, %ecx # [1:0.50] +; SSE2-NEXT: orl %eax, %ecx # [1:0.33] +; SSE2-NEXT: movzwl %cx, %eax # [1:0.33] +; SSE2-NEXT: movd %eax, %xmm0 # [1:0.33] +; SSE2-NEXT: pxor %xmm1, %xmm1 # [1:0.33] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,3][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,4,4][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,7][1:0.50] +; SSE2-NEXT: packuswb %xmm1, %xmm0 # [1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: PR31364: ; SSSE3: # BB#0: -; SSSE3-NEXT: movzbl (%rdi), %eax -; SSSE3-NEXT: movzbl (%rsi), %ecx -; SSSE3-NEXT: shll $8, %ecx -; SSSE3-NEXT: orl %eax, %ecx -; SSSE3-NEXT: movzwl %cx, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] -; SSSE3-NEXT: retq +; SSSE3-NEXT: movzbl (%rdi), %eax # [5:0.50] +; SSSE3-NEXT: movzbl (%rsi), %ecx # [5:0.50] +; SSSE3-NEXT: shll $8, %ecx # [1:0.50] +; SSSE3-NEXT: orl %eax, %ecx # [1:0.33] +; SSSE3-NEXT: movzwl %cx, %eax # [1:0.33] +; SSSE3-NEXT: movd %eax, %xmm0 # [1:0.33] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0][5:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: PR31364: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pinsrb $0, (%rdi), %xmm0 -; SSE41-NEXT: pinsrb $1, (%rsi), %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] -; SSE41-NEXT: retq +; SSE41-NEXT: pxor %xmm0, %xmm0 # [1:0.33] +; SSE41-NEXT: pinsrb $0, (%rdi), %xmm0 # [5:0.50] +; SSE41-NEXT: pinsrb $1, (%rsi), %xmm0 # [5:0.50] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0][5:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX-LABEL: PR31364: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] -; AVX-NEXT: retq +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 # [1:0.33] +; AVX-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0 # [5:0.50] +; AVX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0 # [5:0.50] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0][5:0.50] +; AVX-NEXT: retq # [5:1.00] %v0 = load i8, i8* %a, align 1 %vecins = insertelement <16 x i8> , i8 %v0, i32 0 %v1 = load i8, i8* %b, align 1 @@ -1766,61 +1770,61 @@ define <16 x i8> @PR31301(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; SSE2-LABEL: PR31301: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movzbl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: movzbl (%rsi), %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: retq +; SSE2-NEXT: movzbl (%rdi), %eax # [5:0.50] +; SSE2-NEXT: movd %eax, %xmm0 # [1:0.33] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; SSE2-NEXT: movzbl (%rsi), %eax # [5:0.50] +; SSE2-NEXT: movd %eax, %xmm1 # [1:0.33] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7][1:0.50] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1][1:0.50] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; SSE2-NEXT: retq # [5:1.00] ; ; SSSE3-LABEL: PR31301: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movzbl (%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: movzbl (%rsi), %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: pshufb %xmm1, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: retq +; SSSE3-NEXT: movzbl (%rdi), %eax # [5:0.50] +; SSSE3-NEXT: movd %eax, %xmm0 # [1:0.33] +; SSSE3-NEXT: pxor %xmm1, %xmm1 # [1:0.33] +; SSSE3-NEXT: pshufb %xmm1, %xmm0 # [1:0.50] +; SSSE3-NEXT: movzbl (%rsi), %eax # [5:0.50] +; SSSE3-NEXT: movd %eax, %xmm2 # [1:0.33] +; SSSE3-NEXT: pshufb %xmm1, %xmm2 # [1:0.50] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; SSSE3-NEXT: retq # [5:1.00] ; ; SSE41-LABEL: PR31301: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: movzbl (%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pshufb %xmm1, %xmm0 -; SSE41-NEXT: movzbl (%rsi), %eax -; SSE41-NEXT: movd %eax, %xmm2 -; SSE41-NEXT: pshufb %xmm1, %xmm2 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE41-NEXT: retq +; SSE41-NEXT: movzbl (%rdi), %eax # [5:0.50] +; SSE41-NEXT: movd %eax, %xmm0 # [1:0.33] +; SSE41-NEXT: pxor %xmm1, %xmm1 # [1:0.33] +; SSE41-NEXT: pshufb %xmm1, %xmm0 # [1:0.50] +; SSE41-NEXT: movzbl (%rsi), %eax # [5:0.50] +; SSE41-NEXT: movd %eax, %xmm2 # [1:0.33] +; SSE41-NEXT: pshufb %xmm1, %xmm2 # [1:0.50] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; SSE41-NEXT: retq # [5:1.00] ; ; AVX1-LABEL: PR31301: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: movzbl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: movzbl (%rsi), %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: retq +; AVX1-NEXT: movzbl (%rdi), %eax # [5:0.50] +; AVX1-NEXT: vmovd %eax, %xmm0 # [1:0.33] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: movzbl (%rsi), %eax # [5:0.50] +; AVX1-NEXT: vmovd %eax, %xmm2 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 # [1:0.50] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: PR31301: ; AVX2OR512VL: # BB#0: # %entry -; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0 -; AVX2OR512VL-NEXT: vpbroadcastb (%rsi), %xmm1 -; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0 # [4:0.50] +; AVX2OR512VL-NEXT: vpbroadcastb (%rsi), %xmm1 # [4:0.50] +; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] entry: %0 = load i8, i8* %x, align 1 %1 = insertelement <16 x i8> undef, i8 %0, i32 0 Index: test/CodeGen/X86/vector-shuffle-256-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v16.ll +++ test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -8,15 +8,15 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -24,17 +24,17 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: 
shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -42,17 +42,17 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -60,17 +60,17 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -78,17 +78,17 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -96,17 +96,17 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -114,17 +114,17 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -132,17 +132,17 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -150,29 +150,29 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3][5:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 # [1:0.50] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1][1:1.00] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15][1:0.50] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5][1:0.50] +; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255][4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 # [2:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0][4:0.50] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -180,27 +180,27 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,6,7,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,6,7,0,1][5:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: retq +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>[4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # [2:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0][4:0.50] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -208,26 +208,26 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1][5:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: retq +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0][4:0.50] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -235,26 +235,26 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: retq +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0][4:0.50] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -262,25 +262,25 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: retq +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1][1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0][4:0.50] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -288,25 +288,25 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX1-NEXT: 
vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: retq +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1][1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0][4:0.50] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -314,25 +314,25 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: retq +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1][1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0][4:0.50] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -340,26 +340,26 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; 
AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: retq +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1][1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: movl $15, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm1 +; AVX512VL-NEXT: movl $15, %eax # [1:0.33] +; AVX512VL-NEXT: vmovd %eax, %xmm1 # [1:0.33] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -367,19 +367,19 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15][1:0.50] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -387,19 +387,19 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,7,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX1-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,7,7,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15][1:0.50] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -407,19 +407,19 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15][1:0.50] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -427,19 +427,19 @@ define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshuflw 
{{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15][1:0.50] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -447,19 +447,19 @@ define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15] -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15][1:0.50] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -467,19 +467,19 @@ define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,7,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,3,3,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,7,7][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: 
shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15] -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15][1:0.50] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -487,21 +487,21 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1][5:0.50] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1][5:0.50] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -509,21 +509,21 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1][5:0.50] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1][5:0.50] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] 
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -531,21 +531,21 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1][5:0.50] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1][5:0.50] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -553,21 +553,21 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1][5:0.50] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1][5:0.50] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -575,21 +575,21 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -597,21 +597,21 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -619,21 +619,21 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX2-NEXT: vperm2i128 {{.*#+}} 
ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -641,16 +641,16 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; AVX1: # BB#0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0] -; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0][4:0.50] +; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 # [1:0.33] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 # [1:0.33] +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 # [1:0.33] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -658,16 +658,16 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15: ; AVX1: # BB#0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0] -; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0][4:0.50] +; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 # [1:0.33] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 # [1:0.33] +; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 # [1:0.33] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -675,13 +675,13 @@ define <16 x i16> @shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) { ; 
AVX1-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31: ; AVX1: # BB#0: -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX1-NEXT: retq +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7][1:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -689,13 +689,13 @@ define <16 x i16> @shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15: ; AVX1: # BB#0: -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] -; AVX1-NEXT: retq +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3][1:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -703,24 +703,25 @@ define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: ; AVX1: # BB#0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0] -; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0][4:0.50] +; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 # [1:0.33] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 # [1:0.33] +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 # [1:0.33] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0][4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 # [2:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movw $-32768, %ax # imm = 0x8000 +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle 
} @@ -728,24 +729,24 @@ define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: ; AVX1: # BB#0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535][4:0.50] +; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 # [1:0.33] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 # [1:0.33] +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 # [1:0.33] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255][4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 # [2:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: movw $1, %ax +; AVX512VL-NEXT: movw $1, %ax # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -753,24 +754,25 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15: ; AVX1: # BB#0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,0,65535,0,65535,0,65535,0,65535] -; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,0,65535,0,65535,0,65535,0,65535][4:0.50] +; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 # [1:0.33] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 # [1:0.33] +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 # [1:0.33] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255][4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 # [2:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movw $21930, %ax # imm = 0x55AA +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } 
@@ -778,24 +780,25 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31: ; AVX1: # BB#0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,0,65535,0,65535,0,65535,65535,0,65535,0,65535,0,65535,0] -; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,0,65535,0,65535,0,65535,65535,0,65535,0,65535,0,65535,0][4:0.50] +; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 # [1:0.33] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 # [1:0.33] +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 # [1:0.33] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0][4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 # [2:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movw $-21931, %ax # imm = 0xAA55 +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -803,13 +806,13 @@ define <16 x i16> @shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31: ; AVX1: # BB#0: -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7] -; AVX1-NEXT: retq +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7][1:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -817,16 +820,16 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16: ; AVX1: # BB#0: -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2OR512VL-NEXT: vpbroadcastd %xmm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX2OR512VL-NEXT: vpbroadcastd %xmm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -834,28 +837,28 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: ; AVX1: # BB#0: -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4][1:0.50] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15][1:0.50] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5][1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16,8,24,8,24,8,24,8,24] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16,8,24,8,24,8,24,8,24][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -863,27 +866,27 @@ define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: 
vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15: ; AVX2: # BB#0: -; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15][1:0.50] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,20,21,22,23,8,8,8,8,28,29,30,31] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,20,21,22,23,8,8,8,8,28,29,30,31][4:0.50] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 # [1:0.33] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -891,32 +894,32 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: ; AVX2: # BB#0: -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] 
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] -; AVX2-NEXT: retq +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7][1:0.50] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,2,1,0,23,22,21,20,11,10,9,8,31,30,29,28] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,2,1,0,23,22,21,20,11,10,9,8,31,30,29,28][4:0.50] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 # [1:0.33] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -924,30 +927,30 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,8,9,4,5,0,1,14,15,10,11,6,7,2,3] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3][1:0.50] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,8,9,4,5,0,1,14,15,10,11,6,7,2,3][4:0.50] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 # [1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3][1:0.50] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: ; AVX2: # BB#0: -; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15][1:0.50] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12][1:0.50] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,2,1,0,19,18,17,16,11,10,9,8,27,26,25,24] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,2,1,0,19,18,17,16,11,10,9,8,27,26,25,24][4:0.50] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vmovdqa 
%ymm2, %ymm0 # [1:0.33] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -955,17 +958,17 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -973,17 +976,17 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -991,17 +994,17 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, 
%xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1009,17 +1012,17 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,24,25,16,17,16,17,16,17,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,24,25,16,17,16,17,16,17,16,17][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1027,17 +1030,17 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,26,27,16,17,16,17,16,17,16,17,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,26,27,16,17,16,17,16,17,16,17,16,17][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1045,17 +1048,17 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08(<16 x 
i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,28,29,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,28,29,16,17,16,17,16,17,16,17,16,17,16,17][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1063,17 +1066,17 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,30,31,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,30,31,16,17,16,17,16,17,16,17,16,17,16,17,16,17][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1081,17 +1084,17 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1099,17 +1102,17 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7][1:0.50] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1117,25 +1120,25 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: 
shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31][5:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u][5:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,12,28,13,29,14,30,15,31] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,12,28,13,29,14,30,15,31][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1143,25 +1146,25 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3][1:0.50] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23][5:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u][5:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: 
shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1169,16 +1172,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,18,19,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,18,19,16,17,16,17,16,17,16,17,16,17,16,17][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1186,16 +1189,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,20,21,16,17,16,17,16,17,16,17,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,20,21,16,17,16,17,16,17,16,17,16,17][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1203,16 +1206,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,0,1,0,1,6,7,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,6,7,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,22,23,16,17,16,17,16,17,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,22,23,16,17,16,17,16,17,16,17][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1220,16 +1223,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,16,17,16,17,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,16,17,16,17,16,17][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1237,16 +1240,16 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,26,27,16,17,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,26,27,16,17,16,17][5:0.50] +; AVX2OR512VL-NEXT: 
retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1254,16 +1257,16 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,28,29,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,28,29,16,17][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1271,16 +1274,16 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,14,15] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,14,15][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,30,31] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,30,31][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1288,17 +1291,17 @@ define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,8,9,8,9,4,5,4,5,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # 
[1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,8,9,8,9,4,5,4,5,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,28,29,28,29,24,25,24,25,20,21,20,21,16,17,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,28,29,28,29,24,25,24,25,20,21,20,21,16,17,16,17][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1306,17 +1309,17 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1324,17 +1327,17 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,28,29,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,28,29,16,17][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x 
i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1342,16 +1345,16 @@ define <16 x i16> @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,2,3,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,0,1,14,15] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,2,3,0,1,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,0,1,14,15][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,16,17,30,31] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,16,17,30,31][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1359,17 +1362,17 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,2,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,12,13,8,9,4,5,4,5,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,2,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,12,13,8,9,4,5,4,5,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,4,5,8,9,8,9,u,u,12,13,28,29,28,29,u,u,24,25,20,21,20,21,16,17,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,4,5,8,9,8,9,u,u,12,13,28,29,28,29,u,u,24,25,20,21,20,21,16,17,16,17][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1377,18 +1380,18 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = 
xmm1[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,3,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,u,u,u,u,24,25,24,25,24,25] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,u,u,u,u,24,25,24,25,24,25][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1396,25 +1399,25 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20: ; AVX2: # BB#0: -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: retq +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,16,16,16,16,20,20,20,20] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,16,16,16,16,20,20,20,20][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1422,26 +1425,26 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: 
vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: retq +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1][1:1.00] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1449,27 +1452,27 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 # [1:1.00] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: retq +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3][1:1.00] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,24,24,24,24,28,28,28,28] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,24,24,24,24,28,28,28,28][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x 
i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1477,26 +1480,26 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 # [1:1.00] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28: ; AVX2: # BB#0: -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: retq +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7][1:0.50] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,24,24,24,24,28,28,28,28] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,24,24,24,24,28,28,28,28][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1504,23 +1507,23 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23: ; AVX1: # BB#0: -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23: ; AVX2: # BB#0: -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1528,16 +1531,16 @@ define <16 x i16> @shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24(<16 x i16> %a) { ; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24: ; AVX1: # BB#0: -; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> ret <16 x i16> %shuffle } @@ -1545,16 +1548,16 @@ define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz(<16 x i16> %a) { ; AVX1-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero[1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero[1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpsrldq {{.*#+}} ymm0 = 
ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero[1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> ret <16 x i16> %shuffle } @@ -1566,16 +1569,16 @@ define <16 x i16> @shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i16> %a) { ; AVX1-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14: ; AVX1: # BB#0: -; AVX1-NEXT: vpslld $16, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpslld $16, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpslld $16, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpslld $16, %ymm0, %ymm0 # [1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> ret <16 x i16> %shuffle } @@ -1583,16 +1586,16 @@ define <16 x i16> @shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i16> %a) { ; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12: ; AVX1: # BB#0: -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpsllq $48, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpsllq $48, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpsllq $48, %ymm0, %ymm0 # [1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> ret <16 x i16> %shuffle } @@ -1600,16 +1603,16 @@ define <16 x i16> @shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz(<16 x i16> %a) { ; AVX1-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpsrld $16, %ymm0, %ymm0 # [1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> ret <16 x i16> %shuffle } @@ -1617,15 +1620,15 @@ define <16 x i16> @shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz(<16 x i16> %a) { ; AVX1-LABEL: 
shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-NEXT: retq +; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 # [1:0.33] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7][1:1.00] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7][1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 # [1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> ret <16 x i16> %shuffle } @@ -1633,16 +1636,16 @@ define <16 x i16> @shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz(<16 x i16> %a) { ; AVX1-LABEL: shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz: ; AVX1: # BB#0: -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero[1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3][1:0.50] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero[1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero[1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> ret <16 x i16> %shuffle } @@ -1650,16 +1653,16 @@ define <16 x i16> @shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_22_zz(<16 x i16> %a) { ; AVX1-LABEL: shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_22_zz: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero[1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_22_zz: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero[1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> ret <16 x i16> %shuffle } @@ -1667,33 +1670,33 @@ define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz(<16 x i16> %a) { ; AVX1-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1][1:0.50] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero[1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3][1:0.50] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero[1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1][1:0.50] +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero[1:0.50] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3][1:0.50] +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero[1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 # [1:0.33] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [28,1,2,3,29,5,6,7,30,9,10,11,31,13,14,15] -; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [28,1,2,3,29,5,6,7,30,9,10,11,31,13,14,15][4:0.50] +; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 # [1:0.33] ; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 # [1:0.33] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> ret <16 x i16> %shuffle } @@ -1701,17 +1704,17 @@ define <16 x i16> @shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, 
%xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm1[30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm1[30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1719,17 +1722,17 @@ define <16 x i16> @shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_24(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_24: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_24: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1737,17 +1740,17 @@ define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = 
xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1755,17 +1758,17 @@ define <16 x i16> @shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_30(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_30: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_30: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm0[30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm0[30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1773,22 +1776,22 @@ define <16 x i16> @shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16: ; AVX1: # BB#0: -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16: ; AVX2: # BB#0: -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,18,19,20,21,22,23,24,25,26,27,28,29,30,31,16,17] -; AVX2-NEXT: retq +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = 
ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,18,19,20,21,22,23,24,25,26,27,28,29,30,31,16,17][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [1,2,3,4,5,6,7,0,17,18,19,20,21,22,23,16] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [1,2,3,4,5,6,7,0,17,18,19,20,21,22,23,16][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1796,22 +1799,22 @@ define <16 x i16> @shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22: ; AVX1: # BB#0: -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22: ; AVX2: # BB#0: -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,30,31,16,17,18,19,20,21,22,23,24,25,26,27,28,29] -; AVX2-NEXT: retq +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,30,31,16,17,18,19,20,21,22,23,24,25,26,27,28,29][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [7,0,1,2,3,4,5,6,23,16,17,18,19,20,21,22] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [7,0,1,2,3,4,5,6,23,16,17,18,19,20,21,22][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1819,23 +1822,23 @@ define <16 x i16> @shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11: ; AVX2OR512VL: # BB#0: -; 
AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1][1:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1843,21 +1846,21 @@ define <16 x i16> @shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0][1:0.50] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0][1:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1865,30 +1868,30 @@ define <16 x i16> @shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15][5:0.50] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27: ; AVX2: # BB#0: -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7][1:0.50] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1][1:0.50] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,5,6,7,16,17,18,27,12,13,14,15,24,25,26,27] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,5,6,7,16,17,18,27,12,13,14,15,24,25,26,27][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1896,22 +1899,22 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3][5:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3] -; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; 
AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3][5:0.50] +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm1 # [1:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1919,27 +1922,27 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7][1:0.50] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4][1:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1947,23 +1950,23 @@ define <16 x i16> @shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7][1:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1971,23 +1974,23 @@ define <16 x i16> @shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7][1:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1995,25 +1998,25 @@ define <16 x i16> @shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2][1:0.50] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2][1:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2021,23 +2024,23 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; 
AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15][5:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 # [1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15][5:0.50] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1][5:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2045,21 +2048,21 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm0[1,0,3,2][1:0.50] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2][1:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2067,25 +2070,25 @@ define <16 x i16> @shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2][1:0.50] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2][1:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2093,23 +2096,23 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, 
%xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3] -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2][1:0.50] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7][1:0.50] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3][1:0.50] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7][1:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2117,33 +2120,33 @@ define <16 x i16> @shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1][4:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = 
[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1][4:0.50] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2151,23 +2154,23 @@ define <16 x i16> @shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3][5:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 # [1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3][5:0.50] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1][5:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2175,23 +2178,23 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpslldq {{.*#+}} xmm2 
= zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3][5:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 # [1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3][5:0.50] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1][5:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2199,23 +2202,23 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3][5:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3][5:0.50] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9][5:0.50] +; 
AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2223,23 +2226,23 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3][5:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 # [1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3][5:0.50] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1][5:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2247,23 +2250,23 @@ define <16 x i16> @shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3][5:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; 
AVX2OR512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3][5:0.50] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9][5:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2271,33 +2274,33 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7][4:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = 
[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7][4:0.50] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2305,33 +2308,33 @@ define <16 x i16> @shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7][4:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7][4:0.50] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2339,33 +2342,33 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13: ; AVX1: # BB#0: -; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7][1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11][4:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7][1:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11][4:0.50] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7][1:0.50] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2373,33 +2376,33 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # 
[1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7][4:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3][1:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7][4:0.50] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3][1:0.50] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2407,23 +2410,23 @@ define <16 x i16> @shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15][5:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; 
AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15][5:0.50] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9][5:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2431,23 +2434,23 @@ define <16 x i16> @shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15][5:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15][5:0.50] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9][5:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2455,23 +2458,23 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15][5:0.50] +; AVX1-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15][5:0.50] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9][5:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2479,23 +2482,23 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15][5:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 # [1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15][5:0.50] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; 
AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2503,25 +2506,25 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3][1:0.50] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7][1:0.50] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3][1:0.50] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2529,23 +2532,23 @@ define <16 x i16> @shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15][5:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: 
shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15][5:0.50] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9][5:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2553,23 +2556,23 @@ define <16 x i16> @shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15][5:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15][5:0.50] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9][5:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2577,23 +2580,23 @@ define <16 x i16> @shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, 
%xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15][5:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 # [1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15][5:0.50] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9][5:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2601,24 +2604,24 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15,16,17,18,19,20,21,30,31,20,21,30,31,28,29,30,31] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15,16,17,18,19,20,21,30,31,20,21,30,31,28,29,30,31][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,7,6,7,8,9,10,11,12,15,14,15] -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7][1:0.50] +; 
AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,7,6,7,8,9,10,11,12,15,14,15][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7][1:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2626,32 +2629,32 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm2 # [1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7][5:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm2 # [1:0.50] +; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,0][1:0.50] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0][1:0.50] +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2659,24 +2662,24 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, 
%ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3,24,25,26,27,28,29,22,23,24,25,26,27,16,17,18,19] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3,24,25,26,27,28,29,22,23,24,25,26,27,16,17,18,19][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,2,0,7,5,6,4] -; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,3,4,5,6,7,8,11,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,2,0,7,5,6,4][1:0.50] +; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,3,4,5,6,7,8,11,10,11,12,13,14,15][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7][1:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2684,16 +2687,16 @@ define <16 x i16> @shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a) { ; AVX1-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7] -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7][1:0.50] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 # [1:0.33] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7][1:0.50] +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3][1:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,ymm0[4,5],zero,zero,ymm0[8,9,u,u,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,ymm0[4,5],zero,zero,ymm0[8,9,u,u,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> ret <16 x i16> %shuffle } @@ -2701,36 +2704,36 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = 
[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7][4:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,2] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,1,2] -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3][1:0.50] +; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,2][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3][1:0.50] +; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,1,2][1:0.50] +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2738,32 +2741,32 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3][5:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # 
[5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3][5:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7][1:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15][5:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,3,1] -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3][5:0.50] +; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,0][1:0.50] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,3,1][1:0.50] +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2771,33 +2774,33 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7][1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 
= [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11][4:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3][1:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11][4:0.50] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 # [1:0.50] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3][1:0.50] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2805,32 +2808,32 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,14,15,8,9,12,13,14,15] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,14,15,8,9,12,13,14,15][5:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,2,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: retq +; AVX2-NEXT: 
vextracti128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3][1:0.50] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3][1:0.50] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,2,4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 # [1:1.00] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11][1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2838,32 +2841,32 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,14,15,8,9,12,13,14,15] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3][1:0.50] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,14,15,8,9,12,13,14,15][5:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31: ; AVX2: # BB#0: -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15] -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm0[7] -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7][1:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15][4:0.50] +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm2 # [1:0.50] +; AVX2-NEXT: vextracti128 $1, 
%ymm0, %xmm0 # [1:1.00] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm0[7][1:0.50] +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,20,1,21,2,22,3,31,8,28,9,29,10,30,11,31] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,20,1,21,2,22,3,31,8,28,9,29,10,30,11,31][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2871,32 +2874,32 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,4,5,4,5,0,1,4,5,8,9,14,15] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7][1:0.50] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,4,5,4,5,0,1,4,5,8,9,14,15][5:0.50] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,2,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7][1:0.50] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,2,4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 # [1:1.00] +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15][1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2904,32 +2907,32 @@ define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,4,5,4,5,0,1,4,5,8,9,14,15] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,4,5,4,5,0,1,4,5,8,9,14,15][5:0.50] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27: ; AVX2: # BB#0: -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7][1:0.50] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7][1:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7][4:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,16,5,17,6,18,7,27,12,24,13,25,14,26,15,27] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,16,5,17,6,18,7,27,12,24,13,25,14,26,15,27][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; 
AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2937,39 +2940,39 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,3][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15][5:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7][1:0.50] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3][1:0.50] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # [1:1.00] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = 
ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15][1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,6,22,7,31,8,24,9,25,14,30,15,31] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,6,22,7,31,8,24,9,25,14,30,15,31][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -2977,36 +2980,36 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,0,2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,0,1,2,3,2,3,0,1,12,13,2,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,0,2,3][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,0,1,2,3,2,3,0,1,12,13,2,3][5:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3][4:0.50] +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4 # [1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 # [1:0.50] +; AVX2-NEXT: 
vinserti128 $1, %xmm4, %ymm1, %ymm1 # [1:1.00] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15][1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,20,1,21,6,16,7,25,8,28,9,29,14,24,15,25] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,20,1,21,6,16,7,25,8,28,9,29,14,24,15,25][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3014,35 +3017,35 @@ define <16 x i16> @shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,12,13,10,11,8,9,10,11,12,13,10,11] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,12,13,10,11,8,9,10,11,12,13,10,11][5:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7][1:0.50] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,2,3,0,1,8,9,10,11,6,7,4,5] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,4,5,6,7,6,7,4,5,4,5,6,7,18,19,16,17,20,21,22,23,22,23,20,21,20,21,22,23] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3][1:0.50] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5][1:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,2,3,0,1,8,9,10,11,6,7,4,5][5:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # [1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,4,5,6,7,6,7,4,5,4,5,6,7,18,19,16,17,20,21,22,23,22,23,20,21,20,21,22,23][5:0.50] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [1,0,17,16,3,2,19,26,9,8,25,24,11,10,27,26] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [1,0,17,16,3,2,19,26,9,8,25,24,11,10,27,26][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3050,33 +3053,33 @@ define <16 x i16> @shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,14,15,8,9,12,13,14,15] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,14,15,8,9,12,13,14,15][5:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3][1:0.50] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3][1:0.50] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11][1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27][4:0.50] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 # [1:0.33] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3084,33 +3087,33 @@ define <16 x i16> @shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,4,5,4,5,0,1,4,5,8,9,14,15] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7][1:0.50] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,4,5,4,5,0,1,4,5,8,9,14,15][5:0.50] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15][1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31][4:0.50] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 # [1:0.33] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3118,36 +3121,36 @@ define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,4,5,14,15,0,1,4,5,4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7][1:0.50] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0][1:0.50] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,4,5,14,15,0,1,4,5,4,5,6,7][5:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7][1:0.50] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: ; AVX2: # BB#0: -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; 
AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7][1:0.50] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7][1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7][1:0.50] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,2,1,3,20,22,21,31,8,10,9,11,28,30,29,31] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,2,1,3,20,22,21,31,8,10,9,11,28,30,29,31][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3155,29 +3158,29 @@ define <16 x i16> @shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu: ; AVX2: # BB#0: -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,2,3,6,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,2,4,5,6,7,8,8,11,10,12,13,14,15] -; AVX2-NEXT: retq +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15][1:0.50] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,2,3,6,5,6,7][1:0.50] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,2,4,5,6,7,8,8,11,10,12,13,14,15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: 
shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <4,4,3,18,u,u,u,u,12,12,11,26,u,u,u,u> +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <4,4,3,18,u,u,u,u,12,12,11,26,u,u,u,u>[4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3185,27 +3188,27 @@ define <16 x i16> @shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7][1:0.50] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3][4:0.50] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 # [1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu: ; AVX2: # BB#0: -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3,16,17,22,23,20,21,26,27,16,17,26,27,16,17,18,19] -; AVX2-NEXT: retq +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7][1:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3,16,17,22,23,20,21,26,27,16,17,26,27,16,17,18,19][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <0,3,2,21,u,u,u,u,8,11,10,29,u,u,u,u> +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <0,3,2,21,u,u,u,u,8,11,10,29,u,u,u,u>[4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3213,13 +3216,13 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-NEXT: retq +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[0,2,2,3,4,6,6,7][1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,2,2,3,4,6,6,7] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,2,2,3,4,6,6,7][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3227,26 
+3230,26 @@ define <16 x i16> @shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7][1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <0,1,2,21,u,u,u,u,8,9,10,29,u,u,u,u> +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <0,1,2,21,u,u,u,u,8,9,10,29,u,u,u,u>[4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3254,26 +3257,26 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm0[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm0[7][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11: ; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,2] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] -; AVX2-NEXT: retq +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,2][1:1.00] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: 
shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4:0.50] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 # [1:0.33] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3281,27 +3284,27 @@ define <16 x i16> @shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7][1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <4,5,6,19,u,u,u,u,12,13,14,27,u,u,u,u> +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <4,5,6,19,u,u,u,u,12,13,14,27,u,u,u,u>[4:0.50] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 # [1:0.33] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3309,32 +3312,32 @@ define <16 x i16> @shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6],xmm0[7] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3][1:0.50] +; AVX1-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6],xmm0[7][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 # [1:1.00] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11: ; AVX2: # BB#0: -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7][1:0.50] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7][1:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7][4:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,1,2,21,20,21,22,11,8,9,10,29,28,29,30,11] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,1,2,21,20,21,22,11,8,9,10,29,28,29,30,11][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3342,25 +3345,25 @@ define <16 x i16> @shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4,5,6],xmm3[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6],xmm0[7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4,5,6],xmm3[7][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6],xmm0[7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15: ; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12,13,14],ymm0[15] -; AVX2-NEXT: retq +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3][1:1.00] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12,13,14],ymm0[15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: 
shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,17,2,3,20,21,22,15,8,25,10,11,28,29,30,15] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,17,2,3,20,21,22,15,8,25,10,11,28,29,30,15][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3368,32 +3371,32 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm1[7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm1[7][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 # [1:1.00] +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 # [1:1.00] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15][1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3401,31 +3404,31 @@ define <16 x i16> @shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa 
{{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5][4:0.50] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5,16,17,20,21,20,21,22,23,16,17,20,21,24,25,20,21] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5,16,17,20,21,20,21,22,23,16,17,20,21,24,25,20,21][5:0.50] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7][1:0.50] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4:0.50] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 # [1:0.33] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3433,29 +3436,29 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12: ; AVX2: # BB#0: -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 
= ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15][1:0.50] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7][1:0.50] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3463,17 +3466,17 @@ define <16 x i16> @shuffle_v16i16_uu_22_uu_uu_01_02_03_uu_uu_30_uu_uu_09_10_11_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_22_uu_uu_01_02_03_uu_uu_30_uu_uu_09_10_11_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_uu_22_uu_uu_01_02_03_uu_uu_30_uu_uu_09_10_11_uu: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9],ymm1[26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9],ymm1[26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3481,21 +3484,21 @@ define <16 x i16> @shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3503,16 +3506,16 @@ define <16 x i16> @shuffle_v16i16_uu_06_uu_uu_01_02_03_uu_uu_14_uu_uu_09_10_11_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_06_uu_uu_01_02_03_uu_uu_14_uu_uu_09_10_11_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_uu_06_uu_uu_01_02_03_uu_uu_14_uu_uu_09_10_11_uu: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3520,16 +3523,16 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_01_02_03_uu_uu_uu_uu_uu_09_10_11_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_uu_01_02_03_uu_uu_uu_uu_uu_09_10_11_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpslldq 
{{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_01_02_03_uu_uu_uu_uu_uu_09_10_11_uu: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3537,32 +3540,32 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,8,9,0,1,4,5,10,11] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,8,9,0,1,4,5,10,11][5:0.50] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero[1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10: ; AVX2: # BB#0: -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15][1:0.50] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7][1:0.50] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5][1:0.50] +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26] +; 
AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26][4:0.50] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 # [1:0.33] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3570,17 +3573,17 @@ define <16 x i16> @shuffle_v16i16_uu_20_21_22_uu_uu_01_uu_uu_28_29_30_uu_uu_09_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_20_21_22_uu_uu_01_uu_uu_28_29_30_uu_uu_09_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_uu_20_21_22_uu_uu_01_uu_uu_28_29_30_uu_uu_09_uu: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5],ymm1[22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5],ymm1[22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3588,21 +3591,21 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5][1:0.50] +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5][1:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3610,16 +3613,16 @@ define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_01_uu_uu_12_13_14_uu_uu_09_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_01_uu_uu_12_13_14_uu_uu_09_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_01_uu_uu_12_13_14_uu_uu_09_uu: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3627,16 +3630,16 @@ define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_uu_uu_uu_12_13_14_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_uu_uu_uu_12_13_14_uu_uu_uu_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero[1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero[1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_uu_uu_uu_12_13_14_uu_uu_uu_uu: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero[1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3644,31 +3647,31 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: 
shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,8,9,0,1,4,5,10,11] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,8,9,0,1,4,5,10,11][5:0.50] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero[1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26: ; AVX2: # BB#0: -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15][1:0.50] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7][1:0.50] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5][1:0.50] +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3676,17 +3679,17 @@ define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_17_uu_uu_12_13_14_uu_uu_25_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_17_uu_uu_12_13_14_uu_uu_25_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 
$1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_17_uu_uu_12_13_14_uu_uu_25_uu: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5],ymm0[22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5],ymm0[22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3694,30 +3697,30 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28: ; AVX2: # BB#0: -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15][1:0.50] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7][1:0.50] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12][4:0.50] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 # [1:0.33] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle 
= shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3725,17 +3728,17 @@ define <16 x i16> @shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9],ymm0[26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9],ymm0[26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3743,29 +3746,29 @@ define <16 x i16> @shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,4] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,4][1:0.50] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3][1:0.50] +; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4][1:0.50] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu: ; AVX2: # BB#0: -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8,9,10,11],ymm1[12],ymm0[13,14],ymm1[15] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,6,7,6,7,8,9,8,9,10,11,14,15,30,31,30,31,22,23,22,23,24,25,24,25,26,27,30,31] -; AVX2-NEXT: retq +; AVX2-NEXT: vpblendw {{.*#+}} 
ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8,9,10,11],ymm1[12],ymm0[13,14],ymm1[15][1:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,6,7,6,7,8,9,8,9,10,11,14,15,30,31,30,31,22,23,22,23,24,25,24,25,26,27,30,31][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <7,u,19,u,4,4,21,u,15,u,27,u,12,12,29,u> +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <7,u,19,u,4,4,21,u,15,u,27,u,12,12,29,u>[4:0.50] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 # [1:0.33] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3773,15 +3776,15 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19: ; AVX1: # BB#0: -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3789,17 +3792,17 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: ; AVX1: # BB#0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3807,17 +3810,17 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} 
xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3825,8 +3828,8 @@ define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) { ; ALL-LABEL: shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u: ; ALL: # BB#0: -; ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; ALL-NEXT: retq +; ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; ALL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3834,9 +3837,9 @@ define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) { ; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: ; ALL: # BB#0: -; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; ALL-NEXT: retq +; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7][1:0.50] +; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; ALL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3844,17 +3847,17 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -3862,27 +3865,27 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25(<16 x i16> %a0, <16 x i16> %a1) { ; AVX1-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25: ; AVX1: # BB#0: -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19][5:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31][5:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25][4:0.50] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> ret <16 x i16> %1 } @@ -3890,37 +3893,37 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25(<16 x i16> %a0, <16 x i16> %a1) { ; AVX1-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,2,4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,7,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,2,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7] -; 
AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,2,4,5,6,7][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,7,7][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,2,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19][5:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31][5:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15][1:0.50] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3][1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25][4:0.50] ; AVX512VL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3][1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> %2 = bitcast <16 x i16> %1 to <4 x i64> %3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> @@ -3931,9 +3934,9 @@ define <16 x i16> @insert_v16i16_0elt_into_zero_vector(i16* %ptr) { ; ALL-LABEL: insert_v16i16_0elt_into_zero_vector: ; ALL: # BB#0: -; ALL-NEXT: movzwl (%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm0 -; ALL-NEXT: retq +; ALL-NEXT: movzwl (%rdi), %eax # [5:0.50] +; ALL-NEXT: vmovd %eax, %xmm0 # [1:0.33] +; ALL-NEXT: retq # [5:1.00] %val = load i16, i16* %ptr %i0 = insertelement <16 x i16> 
zeroinitializer, i16 %val, i32 0 ret <16 x i16> %i0 @@ -3942,13 +3945,13 @@ define <16 x i16> @concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31: ; AVX1: # BB#0: -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-NEXT: retq +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3][1:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %alo = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> %bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> %shuf = shufflevector <8 x i16> %alo, <8 x i16> %bhi, <16 x i32> @@ -3958,8 +3961,8 @@ define <16 x i16> @concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc(<16 x i16> %a, <16 x i16> %b) { ; ALL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc: ; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: retq +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3][1:1.00] +; ALL-NEXT: retq # [5:1.00] %ahi = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> %bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> %bc0hi = bitcast <8 x i16> %ahi to <16 x i8> @@ -3972,45 +3975,45 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: PR24935: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,5,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[2,3,2,3,4,5,6,7,8,9,8,9,0,1,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6],xmm2[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,4,5,4,5,10,11,4,5,14,15,12,13,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5],xmm0[6],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5,6,7][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 # [1:1.00] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,5,5,6,7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[2,3,2,3,4,5,6,7,8,9,8,9,0,1,2,3][5:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6,7][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6],xmm2[7][1:0.50] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3][1:0.50] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,4,5,7,7][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,4,5,4,5,10,11,4,5,14,15,12,13,0,1][5:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5],xmm0[6],xmm1[7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: PR24935: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0> -; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4],ymm2[5,6,7,8],ymm0[9,10],ymm2[11],ymm0[12],ymm2[13,14,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1][1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23][5:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17][5:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0>[4:0.50] +; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 # [2:1.00] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1][1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u][5:0.50] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15][1:0.50] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15][1:0.50] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4],ymm2[5,6,7,8],ymm0[9,10],ymm2[11],ymm0[12],ymm2[13,14,15][1:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255][4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # [2:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: PR24935: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [11,10,17,13,10,7,27,0,17,25,0,12,29,20,16,8] +; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [11,10,17,13,10,7,27,0,17,25,0,12,29,20,16,8][4:0.50] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 # [1:0.33] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -4018,15 +4021,15 @@ define <16 x i16> @insert_dup_mem_v16i16_i32(i32* %ptr) { ; AVX1-LABEL: insert_dup_mem_v16i16_i32: ; AVX1: # BB#0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 
= mem[0],zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: insert_dup_mem_v16i16_i32: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %ymm0 # [4:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16> @@ -4037,24 +4040,24 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16(i16* %ptr) { ; AVX1-LABEL: insert_dup_mem_v16i16_sext_i16: ; AVX1: # BB#0: -; AVX1-NEXT: movswl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: movswl (%rdi), %eax # [5:0.50] +; AVX1-NEXT: vmovd %eax, %xmm0 # [1:0.33] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16: ; AVX2: # BB#0: -; AVX2-NEXT: movswl (%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: movswl (%rdi), %eax # [5:0.50] +; AVX2-NEXT: vmovd %eax, %xmm0 # [1:0.33] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: movswl (%rdi), %eax +; AVX512VL-NEXT: movswl (%rdi), %eax # [5:0.50] ; AVX512VL-NEXT: vpbroadcastw %ax, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %tmp = load i16, i16* %ptr, align 2 %tmp1 = sext i16 %tmp to i32 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 @@ -4066,15 +4069,15 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i32(i32* %ptr) #0 { ; AVX1-LABEL: insert_dup_elt1_mem_v16i16_i32: ; AVX1: # BB#0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v16i16_i32: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0 # [4:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16> @@ -4085,15 +4088,15 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(i32* %ptr) #0 { ; AVX1-LABEL: insert_dup_elt3_mem_v16i16_i32: ; AVX1: # BB#0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero[4:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v16i16_i32: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0 # [4:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1 %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16> Index: test/CodeGen/X86/vector-shuffle-256-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v32.ll +++ test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -8,15 +8,15 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -24,17 +24,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -42,17 +42,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -60,17 +60,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -78,17 +78,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -96,17 +96,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -114,17 +114,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -132,17 +132,17 @@ define <32 x i8> 
@shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -150,17 +150,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -168,17 +168,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -186,17 +186,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -204,17 +204,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = 
shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -222,17 +222,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -240,17 +240,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -258,17 +258,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] 
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -276,19 +276,19 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: movl $15, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: movl $15, %eax # [1:0.33] +; AVX1-NEXT: vmovd %eax, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0][5:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -296,35 +296,36 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1][5:0.50] +; AVX1-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 # [1:0.33] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255][4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # [2:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2 # [1:0.33] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 # [1:0.50] +; AVX512VL-NEXT: vpbroadcastb %xmm0, %xmm0 # [1:0.50] ; AVX512VL-NEXT: movl $32767, %eax # imm = 0x7FFF +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 # [1:0.33] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -332,30 +333,30 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
<0,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2-NEXT: retq +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>[4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # [2:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: movl $1, %eax +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX512VL-NEXT: movl $1, %eax # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -363,30 +364,30 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2-NEXT: retq +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>[4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # [2:1.00] +; AVX2-NEXT: vpshufb 
{{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: movw $1, %ax +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX512VL-NEXT: movw $1, %ax # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -394,30 +395,30 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2-NEXT: retq +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>[4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # [2:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: movw $1, %ax +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX512VL-NEXT: movw $1, %ax # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; 
AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -425,20 +426,20 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -446,20 +447,20 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,11,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,11,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -467,20 +468,20 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -488,20 +489,20 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -509,27 +510,27 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,xmm2[8],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,xmm2[8],zero,zero,zero,zero,zero,zero,zero,zero[5:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 # [1:0.33] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; 
AVX2-NEXT: retq +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1][1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -537,27 +538,27 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[9],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[9],zero,zero,zero,zero,zero,zero,zero,zero,zero[5:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 # [1:0.33] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2-NEXT: retq +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1][1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -565,27 +566,27 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[10],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[10],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero[5:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 # [1:0.33] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2-NEXT: retq +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1][1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -593,27 +594,27 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0] 
-; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero[5:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 # [1:0.33] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2-NEXT: retq +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1][1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -621,27 +622,27 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero[5:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 # [1:0.33] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2-NEXT: retq +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1][1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -649,27 +650,27 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero[5:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 # [1:0.33] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2-NEXT: retq +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1][1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2,3,4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -677,27 +678,27 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[14],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[14],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero[5:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 # [1:0.33] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2-NEXT: retq +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1][1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -705,33 +706,33 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: movl $128, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrldq 
{{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: movl $128, %eax # [1:0.33] +; AVX1-NEXT: vmovd %eax, %xmm2 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero[1:0.50] +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 # [1:0.33] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] -; AVX2-NEXT: movl $15, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1][1:1.00] +; AVX2-NEXT: movl $15, %eax # [1:0.33] +; AVX2-NEXT: vmovd %eax, %xmm1 # [1:0.33] +; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 # [1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT: movl $15, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm1 -; AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1][1:1.00] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7][1:0.50] +; AVX512VL-NEXT: movl $15, %eax # [1:0.33] +; AVX512VL-NEXT: vmovd %eax, %xmm1 # [1:0.33] +; AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 # [1:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -739,18 +740,18 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2OR512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 # [1:0.33] +; AVX2OR512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 # [1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -758,17 +759,17 
@@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -776,17 +777,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -794,17 +795,17 @@ define <32 x i8> @shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: 
vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15,23,23,23,23,23,23,23,23,31,31,31,31,31,31,31,31] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15,23,23,23,23,23,23,23,23,31,31,31,31,31,31,31,31][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -812,17 +813,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -830,17 +831,17 @@ define <32 x i8> @shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: 
shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15,19,19,19,19,23,23,23,23,27,27,27,27,31,31,31,31] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15,19,19,19,19,23,23,23,23,27,27,27,27,31,31,31,31][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -848,17 +849,17 @@ define <32 x i8> @shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -866,17 +867,17 @@ define <32 x i8> @shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15,17,17,19,19,21,21,23,23,25,25,27,27,29,29,31,31] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15,17,17,19,19,21,21,23,23,25,25,27,27,29,29,31,31][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = 
shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -884,21 +885,21 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0][5:0.50] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0][5:0.50] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -906,21 +907,21 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0][5:0.50] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0][5:0.50] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -928,21 +929,21 @@ define <32 x i8> 
@shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0][5:0.50] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0][5:0.50] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -950,21 +951,21 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0][5:0.50] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0][5:0.50] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -972,21 +973,21 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: 
shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -994,27 +995,27 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: movl $15, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: movl $15, %eax # [1:0.33] +; AVX1-NEXT: vmovd %eax, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX2: # BB#0: -; AVX2-NEXT: movl $15, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: retq +; AVX2-NEXT: movl $15, %eax # [1:0.33] +; AVX2-NEXT: vmovd %eax, %xmm1 # [1:0.33] +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: movl $15, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: movl $15, %eax # [1:0.33] +; AVX512VL-NEXT: vmovd %eax, %xmm1 # [1:0.33] +; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1][1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1022,24 +1023,25 @@ define <32 x i8> 
@shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: ; AVX1: # BB#0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0][4:0.50] +; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 # [1:0.33] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 # [1:0.33] +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 # [1:0.33] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0][4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 # [2:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1047,24 +1049,25 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: ; AVX1: # BB#0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0][4:0.50] +; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 # [1:0.33] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 # [1:0.33] +; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 # [1:0.33] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0][4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # [2:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: 
shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1072,15 +1075,16 @@ define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31(<32 x i8> %a) { ; AVX1OR2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: ; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1OR2-NEXT: retq +; AVX1OR2-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 # [5:0.50] +; AVX1OR2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm0 {%k1} {z} -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> ret <32 x i8> %shuffle } @@ -1088,14 +1092,14 @@ define <32 x i8> @shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31(<32 x i8> %a) { ; AVX1-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1],zero,xmm0[2],zero,xmm0[4,u,6,7,8,9,10,11,12,13,14,15] -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1],zero,xmm0[2],zero,xmm0[4,u,6,7,8,9,10,11,12,13,14,15][5:0.50] +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3][1:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,ymm0[2],zero,ymm0[4,u,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,ymm0[2],zero,ymm0[4,u,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> ret <32 x i8> %shuffle } @@ -1103,17 +1107,17 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32: ; AVX1: # BB#0: -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1121,36 +1125,37 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: ; AVX1: # BB#0: -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: 
retq +; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 # [1:0.33] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 # [1:0.50] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15][1:0.50] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5][1:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0][4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 # [2:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5][1:0.50] +; AVX512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2 # [1:0.33] ; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1158,23 +1163,23 @@ define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7][1:0.50] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX2OR512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2 # [1:0.33] +; AVX2OR512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 # [1:0.50] +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7][1:0.50] +; AVX2OR512VL-NEXT: retq # 
[5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1182,24 +1187,24 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <15,14,13,12,11,10,9,8,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <15,14,13,12,11,10,9,8,u,u,u,u,u,u,u,u>[4:0.50] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u>[4:0.50] +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 # [1:0.50] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0][1:0.50] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1207,22 +1212,22 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: 
vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7][1:0.50] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1][4:0.50] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 # [1:0.50] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7][1:0.50] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u] -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16][5:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u][5:0.50] +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1230,17 +1235,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1248,17 +1253,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; 
AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,18,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,18,16,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1266,17 +1271,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1284,17 +1289,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # 
[5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1302,17 +1307,17 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,30,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,30,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1320,18 +1325,18 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX1: # BB#0: -; AVX1-NEXT: movl $15, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: movl $15, %eax # [1:0.33] +; AVX1-NEXT: vmovd %eax, %xmm1 # [1:0.33] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> 
%b, <32 x i32> ret <32 x i8> %shuffle } @@ -1339,17 +1344,17 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7][1:0.50] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1357,17 +1362,17 @@ define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15][1:0.50] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1375,28 +1380,29 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15][1:0.50] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u][5:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31][5:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0][4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 # [2:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u][5:0.50] ; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1404,28 +1410,29 @@ define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7][1:0.50] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u][5:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23][5:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0][4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 # [2:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: 
shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u][5:0.50] ; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1433,16 +1440,16 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,17,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,17,16,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1450,16 +1457,16 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,18,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,18,16,16,16,16,16,16,16,16,16,16,16,16,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1467,16 +1474,16 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1484,16 +1491,16 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1501,16 +1508,16 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,30,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,30,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1518,18 +1525,18 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31: ; AVX1: # BB#0: -; AVX1-NEXT: movl $15, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: movl $15, %eax # [1:0.33] +; AVX1-NEXT: vmovd %eax, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,31] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,31][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1537,16 +1544,16 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, 
%ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,28,28,28,28,24,24,24,24,20,20,20,20,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,28,28,28,28,24,24,24,24,20,20,20,20,16,16,16,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1554,16 +1561,16 @@ define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1571,15 +1578,15 @@ define <32 x i8> @shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,u,u,u,u,u,0,0,0,0,0,14,0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,u,u,u,u,u,0,0,0,0,0,14,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,u,u,u,u,u,16,16,16,16,16,30,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,u,u,u,u,u,16,16,16,16,16,30,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1587,16 +1594,16 @@ define <32 x i8> 
@shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,14,1,1,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,u,0,u,u,u,u,0,0,0,0,0,0,14,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,14,1,1,0,0,0,0,0,0,0,0,0,0,0,0][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,u,0,u,u,u,u,0,0,0,0,0,0,14,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,14,u,u,0,0,0,0,0,0,0,0,0,0,0,0,16,16,u,16,u,u,u,u,16,16,16,16,16,16,30,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,14,u,u,0,0,0,0,0,0,0,0,0,0,0,0,16,16,u,16,u,u,u,u,16,16,16,16,16,16,30,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1604,16 +1611,16 @@ define <32 x i8> @shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12][5:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,u,u,u,4,u,8,8,8,8,u,u,12,u,28,28,28,28,u,u,u,24,20,20,20,20,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,u,u,u,4,u,8,8,8,8,u,u,12,u,28,28,28,28,u,u,u,24,20,20,20,20,16,16,16,16][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1621,17 +1628,17 @@ define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24: ; AVX1: # BB#0: -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vinsertf128 
$1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,u,u,u,u,u,u,u,u,16,16,16,u,u,u,u,u,u,u,24,24,24,24,24,24] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,u,u,u,u,u,u,u,u,16,16,16,u,u,u,u,u,u,u,24,24,24,24,24,24][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1639,57 +1646,59 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,u,1,6],zero,zero,xmm2[0],zero,xmm2[11,u],zero,zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7] -; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,6,u,6,u,u,u,u,u,u,u,15,u,u,u,u] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255] -; AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[u,u],zero,zero,xmm2[12],zero,xmm2[u,u,u],zero,zero,xmm2[u,0,3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero -; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u],zero,zero,xmm4[u,u,u,u,1,6,13,u,u],zero,xmm4[u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255] -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,u,1,6],zero,zero,xmm2[0],zero,xmm2[11,u],zero,zero,zero,zero[5:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7][5:0.50] +; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 # [1:0.33] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 # [1:1.00] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7][1:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,6,u,6,u,u,u,u,u,u,u,15,u,u,u,u][5:0.50] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255][4:0.50] +; AVX1-NEXT: 
vpblendvb %xmm6, %xmm3, %xmm5, %xmm3 # [2:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[u,u],zero,zero,xmm2[12],zero,xmm2[u,u,u],zero,zero,xmm2[u,0,3][5:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero[5:0.50] +; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u],zero,zero,xmm4[u,u,u,u,1,6,13,u,u],zero,xmm4[u,u][5:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u][5:0.50] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 # [1:0.33] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255][4:0.50] +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # [2:1.00] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; AVX2: # BB#0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,255,255,0,255,u,u,u,255,255,u,0,0,u,u,255,u,255,255,0,0,255,0,255,u,0,0,0,0> -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1][1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23][5:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u][5:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,255,255,0,255,u,u,u,255,255,u,0,0,u,u,255,u,255,255,0,0,255,0,255,u,0,0,0,0>[4:0.50] +; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 # [2:1.00] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1][1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u][5:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u][5:0.50] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7][1:0.50] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255][4:0.50] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # [2:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u] +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm2 = 
ymm1[2,3,0,1][1:1.00] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u][5:0.50] ; AVX512VL-NEXT: movl $-222248896, %eax # imm = 0xF2C0C040 +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23] -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7] +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1][1:1.00] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u][5:0.50] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u][5:0.50] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7][1:0.50] ; AVX512VL-NEXT: movl $134948620, %eax # imm = 0x80B270C +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 # [1:0.33] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1697,17 +1706,17 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: ; AVX1: # BB#0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1715,18 +1724,18 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: ; AVX1: # BB#0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1][1:1.00] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1734,19 +1743,19 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3][1:1.00] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1754,18 +1763,18 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; 
AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 # [1:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7][1:0.50] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24][5:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1773,17 +1782,17 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: ; AVX1: # BB#0: -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15][1:0.50] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15][1:0.50] +; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7][1:0.50] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1791,16 +1800,16 @@ define <32 x i8> 
@shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48(<32 x i8> %a) { ; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48: ; AVX1: # BB#0: -; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> ret <32 x i8> %shuffle } @@ -1808,16 +1817,16 @@ define <32 x i8> @shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) { ; AVX1-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero[1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero[1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero[1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> 
zeroinitializer, <32 x i8> %a, <32 x i32> ret <32 x i8> %shuffle } @@ -1829,16 +1838,16 @@ define <32 x i8> @shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30(<32 x i8> %a) { ; AVX1-LABEL: shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30: ; AVX1: # BB#0: -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpsllw $8, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpsllw $8, %ymm0, %ymm0 # [1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> ret <32 x i8> %shuffle } @@ -1846,16 +1855,16 @@ define <32 x i8> @shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29(<32 x i8> %a) { ; AVX1-LABEL: shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29: ; AVX1: # BB#0: -; AVX1-NEXT: vpslld $16, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpslld $16, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpslld $16, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpslld $16, %ymm0, %ymm0 # [1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> ret <32 x i8> %shuffle } @@ -1863,16 +1872,16 @@ define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25(<32 x i8> %a) { ; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25: ; AVX1: # BB#0: -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpsllq $48, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpsllq $48, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpsllq $48, %ymm0, %ymm0 # [1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> ret 
<32 x i8> %shuffle } @@ -1880,16 +1889,16 @@ define <32 x i8> @shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz(<32 x i8> %a) { ; AVX1-LABEL: shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 # [1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> ret <32 x i8> %shuffle } @@ -1897,16 +1906,16 @@ define <32 x i8> @shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz(<32 x i8> %a) { ; AVX1-LABEL: shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpsrld $16, %ymm0, %ymm0 # [1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> ret <32 x i8> %shuffle } @@ -1914,16 +1923,16 @@ define <32 x i8> @shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) { ; AVX1-LABEL: shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrlq $56, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrlq $56, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpsrlq $56, %xmm0, %xmm1 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpsrlq $56, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpsrlq $56, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpsrlq $56, %ymm0, %ymm0 # [1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> ret <32 x i8> %shuffle } @@ -1931,16 +1940,16 @@ 
define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) { ; AVX1-LABEL: shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz: ; AVX1: # BB#0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero[1:0.50] +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero[1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero[1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> ret <32 x i8> %shuffle } @@ -1948,16 +1957,16 @@ define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz(<32 x i8> %a) { ; AVX1-LABEL: shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: ; AVX1: # BB#0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero[1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3][1:0.50] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero[1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpmovzxbd {{.*#+}} ymm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero[1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> ret <32 x i8> %shuffle } @@ -1965,16 +1974,16 @@ define <32 x i8> @shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz(<32 x i8> %a) { ; AVX1-LABEL: shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15][1:0.50] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero[1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero[1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> ret <32 x i8> %shuffle } @@ -1982,37 +1991,38 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz(<32 x i8> %a) { ; AVX1-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1][1:0.50] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero[1:0.50] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3][1:0.50] +; AVX1-NEXT: vpmovzxbd {{.*#+}} 
xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero[1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1][1:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero[1:0.50] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3][1:0.50] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero[1:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 # [5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1][1:0.50] +; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero[1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3][1:0.50] +; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero[1:0.50] +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] ; AVX512VL-NEXT: movl $286331153, %eax # imm = 0x11111111 +; AVX512VL-NEXT: # [1:0.33] ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm0 {%k1} {z} -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> ret <32 x i8> %shuffle } @@ -2020,17 +2030,17 @@ define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = 
xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -2038,17 +2048,17 @@ define <32 x i8> @shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -2056,16 +2066,16 @@ define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_uu_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_uu_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; AVX1: # BB#0: -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; AVX1-NEXT: vextractf128 $1, 
%ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_uu_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -2073,16 +2083,16 @@ define <32 x i8> @shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 # [1:1.00] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero[1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -2090,16 +2100,16 @@ define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = 
ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -2107,17 +2117,17 @@ define <32 x i8> @shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_32_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_48(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_32_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_48: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_32_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_48: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -2125,17 +2135,17 @@ define <32 x i8> @shuffle_v32i8_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_00_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_00_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63_16: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_00_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = 
ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -2143,17 +2153,17 @@ define <32 x i8> @shuffle_v32i8_15_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_31_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_15_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_31_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_15_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_31_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm0[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm0[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -2161,16 +2171,16 @@ define <32 x i8> @shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_16: ; AVX1: # BB#0: -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,16] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,16][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -2178,16 +2188,16 @@ define <32 x i8> 
@shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; AVX1: # BB#0: -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14][1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,31,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,31,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30][1:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -2195,23 +2205,23 @@ define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10][5:0.50] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15][1:0.50] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1][1:0.50] +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -2219,17 +2229,17 @@ define 
<32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %ymm0 # [1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -2237,27 +2247,27 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: retq +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15][5:0.50] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7][1:0.50] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0][1:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: retq +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 # [1:0.50] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15][5:0.50] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0][1:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 # [1:0.50] +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm0[0,3,2,3][1:0.50] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7][1:0.50] +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0][1:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -2265,15 +2275,15 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) { ; AVX1OR2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1OR2-NEXT: retq +; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15][5:0.50] +; AVX1OR2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15][1:0.50] +; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3][1:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -2281,23 +2291,23 @@ define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6][5:0.50] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] -; AVX2-NEXT: retq +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6][5:0.50] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 # [1:1.00] +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7][1:0.50] +; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6][1:0.50] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3][1:0.50] +; AVX512VL-NEXT: retq # [5:1.00] %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -2305,27 
+2315,27 @@ define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) { ; AVX1-LABEL: PR28136: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,10,10,12,12,14,14,9,9,11,11,13,13,15,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,2,2,4,4,6,6,1,1,3,3,5,5,7,7] -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 # [1:1.00] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7][1:0.50] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,10,10,12,12,14,14,9,9,11,11,13,13,15,15][4:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3 # [1:0.50] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 # [1:1.00] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7][1:0.50] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2 # [1:0.50] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0][4:0.50] +; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 # [2:1.00] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,2,2,4,4,6,6,1,1,3,3,5,5,7,7][4:0.50] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 # [1:0.50] +; AVX1-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0 # [2:1.00] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: PR28136: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23][1:0.50] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3][1:1.00] +; AVX2OR512VL-NEXT: retq # [5:1.00] %1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> %2 = bitcast <32 x i8> %1 to <4 x i64> %3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> @@ -2335,16 +2345,16 @@ define <32 x i8> @insert_dup_mem_v32i8_i32(i32* %ptr) { ; AVX1-LABEL: insert_dup_mem_v32i8_i32: ; AVX1: # BB#0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpxor %xmm1, %xmm1, 
%xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: insert_dup_mem_v32i8_i32: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %ymm0 # [4:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8> @@ -2355,17 +2365,17 @@ define <32 x i8> @insert_dup_mem_v32i8_sext_i8(i8* %ptr) { ; AVX1-LABEL: insert_dup_mem_v32i8_sext_i8: ; AVX1: # BB#0: -; AVX1-NEXT: movsbl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: movsbl (%rdi), %eax # [5:0.50] +; AVX1-NEXT: vmovd %eax, %xmm0 # [1:0.33] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 # [1:0.33] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # [1:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: insert_dup_mem_v32i8_sext_i8: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %ymm0 # [4:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %tmp = load i8, i8* %ptr, align 1 %tmp1 = sext i8 %tmp to i32 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 @@ -2377,15 +2387,15 @@ define <32 x i8> @insert_dup_elt1_mem_v32i8_i32(i32* %ptr) { ; AVX1-LABEL: insert_dup_elt1_mem_v32i8_i32: ; AVX1: # BB#0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v32i8_i32: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpbroadcastb 1(%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastb 1(%rdi), %ymm0 # [4:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8> @@ -2396,15 +2406,15 @@ define <32 x i8> @insert_dup_elt3_mem_v32i8_i32(i32* %ptr) { ; AVX1-LABEL: insert_dup_elt3_mem_v32i8_i32: ; AVX1: # BB#0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero[4:0.50] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v32i8_i32: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpbroadcastb 3(%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2OR512VL-NEXT: vpbroadcastb 3(%rdi), 
%ymm0 # [4:0.50] +; AVX2OR512VL-NEXT: retq # [5:1.00] %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8> @@ -2415,26 +2425,26 @@ define <32 x i8> @insert_dup_elt1_mem_v32i8_sext_i8(i8* %ptr) { ; AVX1-LABEL: insert_dup_elt1_mem_v32i8_sext_i8: ; AVX1: # BB#0: -; AVX1-NEXT: movsbl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-NEXT: movsbl (%rdi), %eax # [5:0.50] +; AVX1-NEXT: vmovd %eax, %xmm0 # [1:0.33] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1][5:0.50] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # [1:1.00] +; AVX1-NEXT: retq # [5:1.00] ; ; AVX2-LABEL: insert_dup_elt1_mem_v32i8_sext_i8: ; AVX2: # BB#0: -; AVX2-NEXT: movsbl (%rdi), %eax -; AVX2-NEXT: shrl $8, %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-NEXT: movsbl (%rdi), %eax # [5:0.50] +; AVX2-NEXT: shrl $8, %eax # [1:0.50] +; AVX2-NEXT: vmovd %eax, %xmm0 # [1:0.33] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 # [1:1.00] +; AVX2-NEXT: retq # [5:1.00] ; ; AVX512VL-LABEL: insert_dup_elt1_mem_v32i8_sext_i8: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: movsbl (%rdi), %eax -; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: movsbl (%rdi), %eax # [5:0.50] +; AVX512VL-NEXT: shrl $8, %eax # [1:0.50] ; AVX512VL-NEXT: vpbroadcastb %al, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-NEXT: retq # [5:1.00] %tmp = load i8, i8* %ptr, align 1 %tmp1 = sext i8 %tmp to i32 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 Index: test/CodeGen/X86/vector-shuffle-avx512.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-avx512.ll +++ test/CodeGen/X86/vector-shuffle-avx512.ll @@ -9,32 +9,32 @@ ; SKX64-LABEL: expand: ; SKX64: # BB#0: ; SKX64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; SKX64-NEXT: movb $5, %al +; SKX64-NEXT: movb $5, %al # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; SKX64-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: expand: ; KNL64: # BB#0: -; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; KNL64-NEXT: vxorps %ymm1, %ymm1, %ymm1 -; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7] -; KNL64-NEXT: retq +; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3][1:1.00] +; KNL64-NEXT: vxorps %ymm1, %ymm1, %ymm1 # [1:1.00] +; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7][1:0.33] +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: expand: ; SKX32: # BB#0: ; SKX32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; SKX32-NEXT: movb $5, %al +; SKX32-NEXT: movb $5, %al # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: expand: ; KNL32: # BB#0: -; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; KNL32-NEXT: vxorps %ymm1, %ymm1, %ymm1 -; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7] -; KNL32-NEXT: retl +; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3][1:1.00] +; KNL32-NEXT: vxorps %ymm1, %ymm1, %ymm1 # [1:1.00] +; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7][1:0.33] +; KNL32-NEXT: retl # [1:1.00] %res = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <8 x i32> ret <8 x float> %res } @@ -43,36 
+43,36 @@ ; SKX64-LABEL: expand1: ; SKX64: # BB#0: ; SKX64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; SKX64-NEXT: movb $-86, %al +; SKX64-NEXT: movb $-86, %al # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; SKX64-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: expand1: ; KNL64: # BB#0: ; KNL64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; KNL64-NEXT: vmovaps {{.*#+}} ymm1 = -; KNL64-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; KNL64-NEXT: vxorps %ymm1, %ymm1, %ymm1 -; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; KNL64-NEXT: retq +; KNL64-NEXT: vmovaps {{.*#+}} ymm1 = [4:0.50] +; KNL64-NEXT: vpermps %ymm0, %ymm1, %ymm0 # [3:1.00] +; KNL64-NEXT: vxorps %ymm1, %ymm1, %ymm1 # [1:1.00] +; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7][1:0.33] +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: expand1: ; SKX32: # BB#0: ; SKX32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; SKX32-NEXT: movb $-86, %al +; SKX32-NEXT: movb $-86, %al # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: expand1: ; KNL32: # BB#0: ; KNL32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; KNL32-NEXT: vmovaps {{.*#+}} ymm1 = -; KNL32-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; KNL32-NEXT: vxorps %ymm1, %ymm1, %ymm1 -; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; KNL32-NEXT: retl +; KNL32-NEXT: vmovaps {{.*#+}} ymm1 = [4:0.50] +; KNL32-NEXT: vpermps %ymm0, %ymm1, %ymm0 # [3:1.00] +; KNL32-NEXT: vxorps %ymm1, %ymm1, %ymm1 # [1:1.00] +; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7][1:0.33] +; KNL32-NEXT: retl # [1:1.00] %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> ret <8 x float> %res } @@ -82,34 +82,34 @@ ; SKX64-LABEL: expand2: ; SKX64: # BB#0: ; SKX64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; SKX64-NEXT: movb $9, %al +; SKX64-NEXT: movb $9, %al # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; SKX64-NEXT: vexpandpd %ymm0, %ymm0 {%k1} {z} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: expand2: ; KNL64: # BB#0: ; KNL64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] -; KNL64-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; KNL64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] -; KNL64-NEXT: retq +; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1][3:1.00] +; KNL64-NEXT: vxorpd %ymm1, %ymm1, %ymm1 # [1:1.00] +; KNL64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3][1:0.33] +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: expand2: ; SKX32: # BB#0: ; SKX32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; SKX32-NEXT: movb $9, %al +; SKX32-NEXT: movb $9, %al # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vexpandpd %ymm0, %ymm0 {%k1} {z} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: expand2: ; KNL32: # BB#0: ; KNL32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] -; KNL32-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; KNL32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] -; KNL32-NEXT: retl +; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1][3:1.00] +; KNL32-NEXT: vxorpd %ymm1, %ymm1, %ymm1 # [1:1.00] +; KNL32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3][1:0.33] +; KNL32-NEXT: retl # [1:1.00] %res = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> ret <4 
x double> %res } @@ -119,32 +119,32 @@ ; SKX64-LABEL: expand3: ; SKX64: # BB#0: ; SKX64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; SKX64-NEXT: movb $-127, %al +; SKX64-NEXT: movb $-127, %al # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; SKX64-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: expand3: ; KNL64: # BB#0: -; KNL64-NEXT: vpbroadcastq %xmm0, %ymm0 -; KNL64-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7] -; KNL64-NEXT: retq +; KNL64-NEXT: vpbroadcastq %xmm0, %ymm0 # [3:1.00] +; KNL64-NEXT: vpxor %ymm1, %ymm1, %ymm1 # [1:0.33] +; KNL64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7][1:0.33] +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: expand3: ; SKX32: # BB#0: ; SKX32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; SKX32-NEXT: movb $-127, %al +; SKX32-NEXT: movb $-127, %al # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: expand3: ; KNL32: # BB#0: -; KNL32-NEXT: vpbroadcastq %xmm0, %ymm0 -; KNL32-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7] -; KNL32-NEXT: retl +; KNL32-NEXT: vpbroadcastq %xmm0, %ymm0 # [3:1.00] +; KNL32-NEXT: vpxor %ymm1, %ymm1, %ymm1 # [1:0.33] +; KNL32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7][1:0.33] +; KNL32-NEXT: retl # [1:1.00] %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <8 x i32> ret <8 x i32> %res } @@ -154,34 +154,34 @@ ; SKX64-LABEL: expand4: ; SKX64: # BB#0: ; SKX64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; SKX64-NEXT: movb $9, %al +; SKX64-NEXT: movb $9, %al # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; SKX64-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: expand4: ; KNL64: # BB#0: ; KNL64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; KNL64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; KNL64-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] -; KNL64-NEXT: retq +; KNL64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1][3:1.00] +; KNL64-NEXT: vpxor %ymm1, %ymm1, %ymm1 # [1:0.33] +; KNL64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7][1:0.33] +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: expand4: ; SKX32: # BB#0: ; SKX32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; SKX32-NEXT: movb $9, %al +; SKX32-NEXT: movb $9, %al # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: expand4: ; KNL32: # BB#0: ; KNL32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; KNL32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; KNL32-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] -; KNL32-NEXT: retl +; KNL32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1][3:1.00] +; KNL32-NEXT: vpxor %ymm1, %ymm1, %ymm1 # [1:0.33] +; KNL32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7][1:0.33] +; KNL32-NEXT: retl # [1:1.00] %res = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <4 x i32> ret <4 x i64> %res } @@ -190,31 +190,31 @@ define <8 x float> @expand5(<4 x float> %a ) { ; SKX64-LABEL: expand5: ; SKX64: # BB#0: -; SKX64-NEXT: vbroadcastss %xmm0, %ymm0 -; SKX64-NEXT: vxorps %ymm1, %ymm1, %ymm1 -; SKX64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; SKX64-NEXT: retq +; 
SKX64-NEXT: vbroadcastss %xmm0, %ymm0 # [3:1.00] +; SKX64-NEXT: vxorps %ymm1, %ymm1, %ymm1 # [1:1.00] +; SKX64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7][1:0.33] +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: expand5: ; KNL64: # BB#0: -; KNL64-NEXT: vbroadcastss %xmm0, %ymm0 -; KNL64-NEXT: vxorps %ymm1, %ymm1, %ymm1 -; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; KNL64-NEXT: retq +; KNL64-NEXT: vbroadcastss %xmm0, %ymm0 # [3:1.00] +; KNL64-NEXT: vxorps %ymm1, %ymm1, %ymm1 # [1:1.00] +; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7][1:0.33] +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: expand5: ; SKX32: # BB#0: -; SKX32-NEXT: vbroadcastss %xmm0, %ymm0 -; SKX32-NEXT: vxorps %ymm1, %ymm1, %ymm1 -; SKX32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; SKX32-NEXT: retl +; SKX32-NEXT: vbroadcastss %xmm0, %ymm0 # [3:1.00] +; SKX32-NEXT: vxorps %ymm1, %ymm1, %ymm1 # [1:1.00] +; SKX32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7][1:0.33] +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: expand5: ; KNL32: # BB#0: -; KNL32-NEXT: vbroadcastss %xmm0, %ymm0 -; KNL32-NEXT: vxorps %ymm1, %ymm1, %ymm1 -; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; KNL32-NEXT: retl +; KNL32-NEXT: vbroadcastss %xmm0, %ymm0 # [3:1.00] +; KNL32-NEXT: vxorps %ymm1, %ymm1, %ymm1 # [1:1.00] +; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7][1:0.33] +; KNL32-NEXT: retl # [1:1.00] %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> ret <8 x float> %res } @@ -223,27 +223,27 @@ define <8 x float> @expand6(<4 x float> %a ) { ; SKX64-LABEL: expand6: ; SKX64: # BB#0: -; SKX64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; SKX64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; SKX64-NEXT: retq +; SKX64-NEXT: vxorps %xmm1, %xmm1, %xmm1 # [1:1.00] +; SKX64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [3:1.00] +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: expand6: ; KNL64: # BB#0: -; KNL64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; KNL64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; KNL64-NEXT: retq +; KNL64-NEXT: vxorps %xmm1, %xmm1, %xmm1 # [1:1.00] +; KNL64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [3:1.00] +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: expand6: ; SKX32: # BB#0: -; SKX32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; SKX32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; SKX32-NEXT: retl +; SKX32-NEXT: vxorps %xmm1, %xmm1, %xmm1 # [1:1.00] +; SKX32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [3:1.00] +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: expand6: ; KNL32: # BB#0: -; KNL32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; KNL32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; KNL32-NEXT: retl +; KNL32-NEXT: vxorps %xmm1, %xmm1, %xmm1 # [1:1.00] +; KNL32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 # [3:1.00] +; KNL32-NEXT: retl # [1:1.00] %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> ret <8 x float> %res } @@ -253,33 +253,37 @@ ; SKX64: # BB#0: ; SKX64-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; SKX64-NEXT: movw $1285, %ax # imm = 0x505 +; SKX64-NEXT: # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; SKX64-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: expand7: ; KNL64: # BB#0: ; KNL64-NEXT: # kill: 
%YMM0 %YMM0 %ZMM0 ; KNL64-NEXT: movw $1285, %ax # imm = 0x505 +; KNL64-NEXT: # [1:0.25] ; KNL64-NEXT: kmovw %eax, %k1 ; KNL64-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z} -; KNL64-NEXT: retq +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: expand7: ; SKX32: # BB#0: ; SKX32-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; SKX32-NEXT: movw $1285, %ax # imm = 0x505 +; SKX32-NEXT: # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: expand7: ; KNL32: # BB#0: ; KNL32-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; KNL32-NEXT: movw $1285, %ax # imm = 0x505 +; KNL32-NEXT: # [1:0.25] ; KNL32-NEXT: kmovw %eax, %k1 ; KNL32-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z} -; KNL32-NEXT: retl +; KNL32-NEXT: retl # [1:1.00] %res = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <16 x i32> ret <16 x float> %res } @@ -289,33 +293,37 @@ ; SKX64: # BB#0: ; SKX64-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; SKX64-NEXT: movw $-21846, %ax # imm = 0xAAAA +; SKX64-NEXT: # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; SKX64-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: expand8: ; KNL64: # BB#0: ; KNL64-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; KNL64-NEXT: movw $-21846, %ax # imm = 0xAAAA +; KNL64-NEXT: # [1:0.25] ; KNL64-NEXT: kmovw %eax, %k1 ; KNL64-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z} -; KNL64-NEXT: retq +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: expand8: ; SKX32: # BB#0: ; SKX32-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; SKX32-NEXT: movw $-21846, %ax # imm = 0xAAAA +; SKX32-NEXT: # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: expand8: ; KNL32: # BB#0: ; KNL32-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; KNL32-NEXT: movw $-21846, %ax # imm = 0xAAAA +; KNL32-NEXT: # [1:0.25] ; KNL32-NEXT: kmovw %eax, %k1 ; KNL32-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z} -; KNL32-NEXT: retl +; KNL32-NEXT: retl # [1:1.00] %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> ret <16 x float> %res } @@ -325,34 +333,34 @@ ; SKX64-LABEL: expand9: ; SKX64: # BB#0: ; SKX64-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; SKX64-NEXT: movb $-127, %al +; SKX64-NEXT: movb $-127, %al # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; SKX64-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: expand9: ; KNL64: # BB#0: ; KNL64-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; KNL64-NEXT: movb $-127, %al +; KNL64-NEXT: movb $-127, %al # [1:0.25] ; KNL64-NEXT: kmovw %eax, %k1 ; KNL64-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z} -; KNL64-NEXT: retq +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: expand9: ; SKX32: # BB#0: ; SKX32-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; SKX32-NEXT: movb $-127, %al +; SKX32-NEXT: movb $-127, %al # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: expand9: ; KNL32: # BB#0: ; KNL32-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; KNL32-NEXT: movb $-127, %al +; KNL32-NEXT: movb $-127, %al # [1:0.25] ; KNL32-NEXT: kmovw %eax, %k1 ; KNL32-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z} -; KNL32-NEXT: retl +; KNL32-NEXT: retl # [1:1.00] %res = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <8 x i32> ret <8 x double> %res } @@ -362,33 +370,37 @@ ; SKX64: # BB#0: ; SKX64-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; SKX64-NEXT: movw $-21846, %ax # imm = 0xAAAA +; SKX64-NEXT: # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; 
SKX64-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: expand10: ; KNL64: # BB#0: ; KNL64-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; KNL64-NEXT: movw $-21846, %ax # imm = 0xAAAA +; KNL64-NEXT: # [1:0.25] ; KNL64-NEXT: kmovw %eax, %k1 ; KNL64-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z} -; KNL64-NEXT: retq +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: expand10: ; SKX32: # BB#0: ; SKX32-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; SKX32-NEXT: movw $-21846, %ax # imm = 0xAAAA +; SKX32-NEXT: # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: expand10: ; KNL32: # BB#0: ; KNL32-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; KNL32-NEXT: movw $-21846, %ax # imm = 0xAAAA +; KNL32-NEXT: # [1:0.25] ; KNL32-NEXT: kmovw %eax, %k1 ; KNL32-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z} -; KNL32-NEXT: retl +; KNL32-NEXT: retl # [1:1.00] %res = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <16 x i32> ret <16 x i32> %res } @@ -397,34 +409,34 @@ ; SKX64-LABEL: expand11: ; SKX64: # BB#0: ; SKX64-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; SKX64-NEXT: movb $-127, %al +; SKX64-NEXT: movb $-127, %al # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; SKX64-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: expand11: ; KNL64: # BB#0: ; KNL64-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; KNL64-NEXT: movb $-127, %al +; KNL64-NEXT: movb $-127, %al # [1:0.25] ; KNL64-NEXT: kmovw %eax, %k1 ; KNL64-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} -; KNL64-NEXT: retq +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: expand11: ; SKX32: # BB#0: ; SKX32-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; SKX32-NEXT: movb $-127, %al +; SKX32-NEXT: movb $-127, %al # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: expand11: ; KNL32: # BB#0: ; KNL32-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; KNL32-NEXT: movb $-127, %al +; KNL32-NEXT: movb $-127, %al # [1:0.25] ; KNL32-NEXT: kmovw %eax, %k1 ; KNL32-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} -; KNL32-NEXT: retl +; KNL32-NEXT: retl # [1:1.00] %res = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <8 x i32> ret <8 x i64> %res } @@ -434,38 +446,38 @@ ; SKX64-LABEL: expand12: ; SKX64: # BB#0: ; SKX64-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; SKX64-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16] +; SKX64-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16][4:0.50] ; SKX64-NEXT: vxorps %zmm1, %zmm1, %zmm1 ; SKX64-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 ; SKX64-NEXT: vmovaps %zmm1, %zmm0 -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: expand12: ; KNL64: # BB#0: ; KNL64-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; KNL64-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16] -; KNL64-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; KNL64-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16][4:0.50] +; KNL64-NEXT: vpxord %zmm1, %zmm1, %zmm1 # [?:0.000000e+00] ; KNL64-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 ; KNL64-NEXT: vmovaps %zmm1, %zmm0 -; KNL64-NEXT: retq +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: expand12: ; SKX32: # BB#0: ; SKX32-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; SKX32-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16] +; SKX32-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16][4:0.50] ; SKX32-NEXT: vxorps %zmm1, %zmm1, %zmm1 ; SKX32-NEXT: vpermt2ps 
%zmm0, %zmm2, %zmm1 ; SKX32-NEXT: vmovaps %zmm1, %zmm0 -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: expand12: ; KNL32: # BB#0: ; KNL32-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; KNL32-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16] -; KNL32-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; KNL32-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16][4:0.50] +; KNL32-NEXT: vpxord %zmm1, %zmm1, %zmm1 # [?:0.000000e+00] ; KNL32-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 ; KNL32-NEXT: vmovaps %zmm1, %zmm0 -; KNL32-NEXT: retl +; KNL32-NEXT: retl # [1:1.00] %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> ret <16 x float> %res } @@ -473,27 +485,27 @@ define <16 x float> @expand13(<8 x float> %a ) { ; SKX64-LABEL: expand13: ; SKX64: # BB#0: -; SKX64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; SKX64-NEXT: vxorps %ymm1, %ymm1, %ymm1 # [1:1.00] ; SKX64-NEXT: vinsertf32x8 $1, %ymm0, %zmm1, %zmm0 -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: expand13: ; KNL64: # BB#0: -; KNL64-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; KNL64-NEXT: vxorpd %ymm1, %ymm1, %ymm1 # [1:1.00] ; KNL64-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 -; KNL64-NEXT: retq +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: expand13: ; SKX32: # BB#0: -; SKX32-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; SKX32-NEXT: vxorps %ymm1, %ymm1, %ymm1 # [1:1.00] ; SKX32-NEXT: vinsertf32x8 $1, %ymm0, %zmm1, %zmm0 -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: expand13: ; KNL32: # BB#0: -; KNL32-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; KNL32-NEXT: vxorpd %ymm1, %ymm1, %ymm1 # [1:1.00] ; KNL32-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 -; KNL32-NEXT: retl +; KNL32-NEXT: retl # [1:1.00] %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> ret <16 x float> %res } @@ -504,38 +516,38 @@ ; SKX64-LABEL: expand14: ; SKX64: # BB#0: ; SKX64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; SKX64-NEXT: movb $20, %al +; SKX64-NEXT: movb $20, %al # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; SKX64-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: expand14: ; KNL64: # BB#0: -; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL64-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> -; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0] -; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] -; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] -; KNL64-NEXT: retq +; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3][1:1.00] +; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3][3:1.00] +; KNL64-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>[4:0.50] +; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0][1:1.00] +; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1][3:1.00] +; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7][1:0.33] +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: expand14: ; SKX32: # BB#0: ; SKX32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; SKX32-NEXT: movb $20, %al +; SKX32-NEXT: movb $20, %al # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: expand14: ; KNL32: # BB#0: -; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL32-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> -; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0] -; KNL32-NEXT: 
vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] -; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] -; KNL32-NEXT: retl +; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3][1:1.00] +; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3][3:1.00] +; KNL32-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>[4:0.50] +; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0][1:1.00] +; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1][3:1.00] +; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7][1:0.33] +; KNL32-NEXT: retl # [1:1.00] %addV = fadd <4 x float> , %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> ret <8 x float> %res @@ -545,41 +557,41 @@ define <8 x float> @expand15(<4 x float> %a) { ; SKX64-LABEL: expand15: ; SKX64: # BB#0: -; SKX64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SKX64-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u> -; SKX64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0] -; SKX64-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3] +; SKX64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3][1:1.00] +; SKX64-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u>[4:0.50] +; SKX64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0][1:1.00] +; SKX64-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3][4:0.50] ; SKX64-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0 -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: expand15: ; KNL64: # BB#0: -; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL64-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> -; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0] -; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] -; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] -; KNL64-NEXT: retq +; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3][1:1.00] +; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3][3:1.00] +; KNL64-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>[4:0.50] +; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0][1:1.00] +; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1][3:1.00] +; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7][1:0.33] +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: expand15: ; SKX32: # BB#0: -; SKX32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SKX32-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u> -; SKX32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0] -; SKX32-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3] +; SKX32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3][1:1.00] +; SKX32-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u>[4:0.50] +; SKX32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0][1:1.00] +; SKX32-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3][4:0.50] ; SKX32-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0 -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: expand15: ; KNL32: # BB#0: -; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL32-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> -; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0] -; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] -; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] -; KNL32-NEXT: retl +; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3][1:1.00] +; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3][3:1.00] +; KNL32-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>[4:0.50] +; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0][1:1.00] +; 
KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1][3:1.00] +; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7][1:0.33] +; KNL32-NEXT: retl # [1:1.00] %addV = fadd <4 x float> , %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> ret <8 x float> %res @@ -592,43 +604,45 @@ ; SKX64-LABEL: test_mm512_mask_blend_epi8: ; SKX64: # BB#0: # %entry ; SKX64-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA +; SKX64-NEXT: # [1:0.25] ; SKX64-NEXT: kmovq %rax, %k1 ; SKX64-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: test_mm512_mask_blend_epi8: ; KNL64: # BB#0: # %entry -; KNL64-NEXT: vpbroadcastw {{.*}}(%rip), %ymm4 -; KNL64-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; KNL64-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 -; KNL64-NEXT: retq +; KNL64-NEXT: vpbroadcastw {{.*}}(%rip), %ymm4 # [7:1.00] +; KNL64-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 # [2:2.00] +; KNL64-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 # [2:2.00] +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: test_mm512_mask_blend_epi8: ; SKX32: # BB#0: # %entry ; SKX32-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA +; SKX32-NEXT: # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k0 ; SKX32-NEXT: kunpckdq %k0, %k0, %k1 ; SKX32-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: test_mm512_mask_blend_epi8: ; KNL32: # BB#0: # %entry -; KNL32-NEXT: pushl %ebp +; KNL32-NEXT: pushl %ebp # [1:1.00] ; KNL32-NEXT: .Lcfi0: ; KNL32-NEXT: .cfi_def_cfa_offset 8 ; KNL32-NEXT: .Lcfi1: ; KNL32-NEXT: .cfi_offset %ebp, -8 -; KNL32-NEXT: movl %esp, %ebp +; KNL32-NEXT: movl %esp, %ebp # [1:0.25] ; KNL32-NEXT: .Lcfi2: ; KNL32-NEXT: .cfi_def_cfa_register %ebp -; KNL32-NEXT: andl $-32, %esp -; KNL32-NEXT: subl $32, %esp -; KNL32-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm3 -; KNL32-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 -; KNL32-NEXT: vpblendvb %ymm3, 8(%ebp), %ymm1, %ymm1 -; KNL32-NEXT: movl %ebp, %esp -; KNL32-NEXT: popl %ebp -; KNL32-NEXT: retl +; KNL32-NEXT: andl $-32, %esp # [1:0.25] +; KNL32-NEXT: subl $32, %esp # [1:0.25] +; KNL32-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm3 # [7:1.00] +; KNL32-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 # [2:2.00] +; KNL32-NEXT: vpblendvb %ymm3, 8(%ebp), %ymm1, %ymm1 # [6:2.00] +; KNL32-NEXT: movl %ebp, %esp # [1:0.25] +; KNL32-NEXT: popl %ebp # [4:0.50] +; KNL32-NEXT: retl # [1:1.00] entry: %0 = shufflevector <64 x i8> %A, <64 x i8> %W, <64 x i32> ret <64 x i8> %0 @@ -638,40 +652,42 @@ ; SKX64-LABEL: test_mm512_mask_blend_epi16: ; SKX64: # BB#0: # %entry ; SKX64-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA +; SKX64-NEXT: # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; SKX64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: test_mm512_mask_blend_epi16: ; KNL64: # BB#0: # %entry -; KNL64-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7],ymm2[8],ymm0[9],ymm2[10],ymm0[11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; KNL64-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7],ymm3[8],ymm1[9],ymm3[10],ymm1[11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] -; KNL64-NEXT: retq +; KNL64-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7],ymm2[8],ymm0[9],ymm2[10],ymm0[11],ymm2[12],ymm0[13],ymm2[14],ymm0[15][1:1.00] +; KNL64-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7],ymm3[8],ymm1[9],ymm3[10],ymm1[11],ymm3[12],ymm1[13],ymm3[14],ymm1[15][1:1.00] +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: test_mm512_mask_blend_epi16: ; SKX32: # BB#0: # %entry ; SKX32-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA +; SKX32-NEXT: # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: test_mm512_mask_blend_epi16: ; KNL32: # BB#0: # %entry -; KNL32-NEXT: pushl %ebp +; KNL32-NEXT: pushl %ebp # [1:1.00] ; KNL32-NEXT: .Lcfi3: ; KNL32-NEXT: .cfi_def_cfa_offset 8 ; KNL32-NEXT: .Lcfi4: ; KNL32-NEXT: .cfi_offset %ebp, -8 -; KNL32-NEXT: movl %esp, %ebp +; KNL32-NEXT: movl %esp, %ebp # [1:0.25] ; KNL32-NEXT: .Lcfi5: ; KNL32-NEXT: .cfi_def_cfa_register %ebp -; KNL32-NEXT: andl $-32, %esp -; KNL32-NEXT: subl $32, %esp -; KNL32-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7],ymm2[8],ymm0[9],ymm2[10],ymm0[11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; KNL32-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] -; KNL32-NEXT: movl %ebp, %esp -; KNL32-NEXT: popl %ebp -; KNL32-NEXT: retl +; KNL32-NEXT: andl $-32, %esp # [1:0.25] +; KNL32-NEXT: subl $32, %esp # [1:0.25] +; KNL32-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7],ymm2[8],ymm0[9],ymm2[10],ymm0[11],ymm2[12],ymm0[13],ymm2[14],ymm0[15][1:1.00] +; KNL32-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15][4:1.00] +; KNL32-NEXT: movl %ebp, %esp # [1:0.25] +; KNL32-NEXT: popl %ebp # [4:0.50] +; KNL32-NEXT: retl # [1:1.00] entry: %0 = shufflevector <32 x i16> %A, <32 x i16> %W, <32 x i32> ret <32 x i16> %0 @@ -681,30 +697,34 @@ ; SKX64-LABEL: test_mm512_mask_blend_epi32: ; SKX64: # BB#0: # %entry ; SKX64-NEXT: movw $-21846, %ax # imm = 0xAAAA +; SKX64-NEXT: # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; SKX64-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: test_mm512_mask_blend_epi32: ; KNL64: # BB#0: # %entry ; KNL64-NEXT: movw $-21846, %ax # imm = 0xAAAA +; KNL64-NEXT: # [1:0.25] ; KNL64-NEXT: kmovw %eax, %k1 ; KNL64-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} -; KNL64-NEXT: retq +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: test_mm512_mask_blend_epi32: ; SKX32: # BB#0: # %entry ; SKX32-NEXT: movw $-21846, %ax # imm = 0xAAAA +; SKX32-NEXT: # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: test_mm512_mask_blend_epi32: ; KNL32: # BB#0: # %entry ; KNL32-NEXT: movw $-21846, %ax # imm = 0xAAAA +; KNL32-NEXT: # [1:0.25] ; KNL32-NEXT: kmovw %eax, %k1 ; KNL32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} -; KNL32-NEXT: retl +; KNL32-NEXT: retl # [1:1.00] entry: %0 = shufflevector <16 x i32> %A, <16 x i32> %W, <16 x i32> ret <16 x i32> %0 @@ -713,31 +733,31 @@ define <8 x i64> @test_mm512_mask_blend_epi64(<8 x i64> %A, <8 x i64> %W){ ; SKX64-LABEL: test_mm512_mask_blend_epi64: ; SKX64: # BB#0: # %entry -; SKX64-NEXT: movb $-86, %al +; SKX64-NEXT: movb $-86, %al # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; SKX64-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: 
test_mm512_mask_blend_epi64: ; KNL64: # BB#0: # %entry -; KNL64-NEXT: movb $-86, %al +; KNL64-NEXT: movb $-86, %al # [1:0.25] ; KNL64-NEXT: kmovw %eax, %k1 ; KNL64-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} -; KNL64-NEXT: retq +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: test_mm512_mask_blend_epi64: ; SKX32: # BB#0: # %entry -; SKX32-NEXT: movb $-86, %al +; SKX32-NEXT: movb $-86, %al # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: test_mm512_mask_blend_epi64: ; KNL32: # BB#0: # %entry -; KNL32-NEXT: movb $-86, %al +; KNL32-NEXT: movb $-86, %al # [1:0.25] ; KNL32-NEXT: kmovw %eax, %k1 ; KNL32-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} -; KNL32-NEXT: retl +; KNL32-NEXT: retl # [1:1.00] entry: %0 = shufflevector <8 x i64> %A, <8 x i64> %W, <8 x i32> ret <8 x i64> %0 @@ -747,30 +767,34 @@ ; SKX64-LABEL: test_mm512_mask_blend_ps: ; SKX64: # BB#0: # %entry ; SKX64-NEXT: movw $-21846, %ax # imm = 0xAAAA +; SKX64-NEXT: # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; SKX64-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: test_mm512_mask_blend_ps: ; KNL64: # BB#0: # %entry ; KNL64-NEXT: movw $-21846, %ax # imm = 0xAAAA +; KNL64-NEXT: # [1:0.25] ; KNL64-NEXT: kmovw %eax, %k1 ; KNL64-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} -; KNL64-NEXT: retq +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: test_mm512_mask_blend_ps: ; SKX32: # BB#0: # %entry ; SKX32-NEXT: movw $-21846, %ax # imm = 0xAAAA +; SKX32-NEXT: # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: test_mm512_mask_blend_ps: ; KNL32: # BB#0: # %entry ; KNL32-NEXT: movw $-21846, %ax # imm = 0xAAAA +; KNL32-NEXT: # [1:0.25] ; KNL32-NEXT: kmovw %eax, %k1 ; KNL32-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} -; KNL32-NEXT: retl +; KNL32-NEXT: retl # [1:1.00] entry: %0 = shufflevector <16 x float> %A, <16 x float> %W, <16 x i32> ret <16 x float> %0 @@ -779,31 +803,31 @@ define <8 x double> @test_mm512_mask_blend_pd(<8 x double> %A, <8 x double> %W){ ; SKX64-LABEL: test_mm512_mask_blend_pd: ; SKX64: # BB#0: # %entry -; SKX64-NEXT: movb $-88, %al +; SKX64-NEXT: movb $-88, %al # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; SKX64-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: test_mm512_mask_blend_pd: ; KNL64: # BB#0: # %entry -; KNL64-NEXT: movb $-88, %al +; KNL64-NEXT: movb $-88, %al # [1:0.25] ; KNL64-NEXT: kmovw %eax, %k1 ; KNL64-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} -; KNL64-NEXT: retq +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: test_mm512_mask_blend_pd: ; SKX32: # BB#0: # %entry -; SKX32-NEXT: movb $-88, %al +; SKX32-NEXT: movb $-88, %al # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: test_mm512_mask_blend_pd: ; KNL32: # BB#0: # %entry -; KNL32-NEXT: movb $-88, %al +; KNL32-NEXT: movb $-88, %al # [1:0.25] ; KNL32-NEXT: kmovw %eax, %k1 ; KNL32-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} -; KNL32-NEXT: retl +; KNL32-NEXT: retl # [1:1.00] entry: %0 = shufflevector <8 x double> %A, <8 x double> %W, <8 x i32> ret <8 x double> %0 @@ -814,28 +838,30 @@ ; SKX64-LABEL: test_mm256_mask_blend_epi8: ; SKX64: # BB#0: # %entry ; SKX64-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA +; SKX64-NEXT: # [1:0.25] ; SKX64-NEXT: 
kmovd %eax, %k1 ; SKX64-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: test_mm256_mask_blend_epi8: ; KNL64: # BB#0: # %entry -; KNL64-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; KNL64-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; KNL64-NEXT: retq +; KNL64-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0][4:0.50] +; KNL64-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # [2:2.00] +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: test_mm256_mask_blend_epi8: ; SKX32: # BB#0: # %entry ; SKX32-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA +; SKX32-NEXT: # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: test_mm256_mask_blend_epi8: ; KNL32: # BB#0: # %entry -; KNL32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; KNL32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; KNL32-NEXT: retl +; KNL32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0][4:0.50] +; KNL32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # [2:2.00] +; KNL32-NEXT: retl # [1:1.00] entry: %0 = shufflevector <32 x i8> %A, <32 x i8> %W, <32 x i32> ret <32 x i8> %0 @@ -845,28 +871,30 @@ ; SKX64-LABEL: test_mm_mask_blend_epi8: ; SKX64: # BB#0: # %entry ; SKX64-NEXT: movw $-21846, %ax # imm = 0xAAAA +; SKX64-NEXT: # [1:0.25] ; SKX64-NEXT: kmovd %eax, %k1 ; SKX64-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1} -; SKX64-NEXT: retq +; SKX64-NEXT: retq # [1:1.00] ; ; KNL64-LABEL: test_mm_mask_blend_epi8: ; KNL64: # BB#0: # %entry -; KNL64-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; KNL64-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; KNL64-NEXT: retq +; KNL64-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0][4:0.50] +; KNL64-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # [2:2.00] +; KNL64-NEXT: retq # [1:1.00] ; ; SKX32-LABEL: test_mm_mask_blend_epi8: ; SKX32: # BB#0: # %entry ; SKX32-NEXT: movw $-21846, %ax # imm = 0xAAAA +; SKX32-NEXT: # [1:0.25] ; SKX32-NEXT: kmovd %eax, %k1 ; SKX32-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1} -; SKX32-NEXT: retl +; SKX32-NEXT: retl # [1:1.00] ; ; KNL32-LABEL: test_mm_mask_blend_epi8: ; KNL32: # BB#0: # %entry -; KNL32-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; KNL32-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; KNL32-NEXT: retl +; KNL32-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0][4:0.50] +; KNL32-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # [2:2.00] +; KNL32-NEXT: retl # [1:1.00] entry: %0 = shufflevector <16 x i8> %A, <16 x i8> %W, <16 x i32> ret <16 x i8> %0 Index: test/CodeGen/X86/xaluo.ll =================================================================== --- test/CodeGen/X86/xaluo.ll +++ test/CodeGen/X86/xaluo.ll @@ -26,10 +26,10 @@ ; ; KNL-LABEL: saddoi8: ; KNL: ## BB#0: -; KNL-NEXT: addb %sil, %dil -; KNL-NEXT: seto %al -; KNL-NEXT: movb %dil, (%rdx) -; KNL-NEXT: retq +; KNL-NEXT: addb %sil, %dil ## [1:0.25] +; KNL-NEXT: seto %al ## [1:0.50] +; KNL-NEXT: movb %dil, (%rdx) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 %v2) %val = 
extractvalue {i8, i1} %t, 0 %obit = extractvalue {i8, i1} %t, 1 @@ -56,10 +56,10 @@ ; ; KNL-LABEL: saddoi16: ; KNL: ## BB#0: -; KNL-NEXT: addw %si, %di -; KNL-NEXT: seto %al -; KNL-NEXT: movw %di, (%rdx) -; KNL-NEXT: retq +; KNL-NEXT: addw %si, %di ## [1:0.25] +; KNL-NEXT: seto %al ## [1:0.50] +; KNL-NEXT: movw %di, (%rdx) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 %v2) %val = extractvalue {i16, i1} %t, 0 %obit = extractvalue {i16, i1} %t, 1 @@ -86,10 +86,10 @@ ; ; KNL-LABEL: saddoi32: ; KNL: ## BB#0: -; KNL-NEXT: addl %esi, %edi -; KNL-NEXT: seto %al -; KNL-NEXT: movl %edi, (%rdx) -; KNL-NEXT: retq +; KNL-NEXT: addl %esi, %edi ## [1:0.25] +; KNL-NEXT: seto %al ## [1:0.50] +; KNL-NEXT: movl %edi, (%rdx) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -116,10 +116,10 @@ ; ; KNL-LABEL: saddoi64: ; KNL: ## BB#0: -; KNL-NEXT: addq %rsi, %rdi -; KNL-NEXT: seto %al -; KNL-NEXT: movq %rdi, (%rdx) -; KNL-NEXT: retq +; KNL-NEXT: addq %rsi, %rdi ## [1:0.25] +; KNL-NEXT: seto %al ## [1:0.50] +; KNL-NEXT: movq %rdi, (%rdx) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -147,10 +147,10 @@ ; ; KNL-LABEL: saddoinci8: ; KNL: ## BB#0: -; KNL-NEXT: incb %dil -; KNL-NEXT: seto %al -; KNL-NEXT: movb %dil, (%rsi) -; KNL-NEXT: retq +; KNL-NEXT: incb %dil ## [1:0.25] +; KNL-NEXT: seto %al ## [1:0.50] +; KNL-NEXT: movb %dil, (%rsi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 1) %val = extractvalue {i8, i1} %t, 0 %obit = extractvalue {i8, i1} %t, 1 @@ -177,10 +177,10 @@ ; ; KNL-LABEL: saddoinci16: ; KNL: ## BB#0: -; KNL-NEXT: incw %di -; KNL-NEXT: seto %al -; KNL-NEXT: movw %di, (%rsi) -; KNL-NEXT: retq +; KNL-NEXT: incw %di ## [1:0.25] +; KNL-NEXT: seto %al ## [1:0.50] +; KNL-NEXT: movw %di, (%rsi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 1) %val = extractvalue {i16, i1} %t, 0 %obit = extractvalue {i16, i1} %t, 1 @@ -207,10 +207,10 @@ ; ; KNL-LABEL: saddoinci32: ; KNL: ## BB#0: -; KNL-NEXT: incl %edi -; KNL-NEXT: seto %al -; KNL-NEXT: movl %edi, (%rsi) -; KNL-NEXT: retq +; KNL-NEXT: incl %edi ## [1:0.25] +; KNL-NEXT: seto %al ## [1:0.50] +; KNL-NEXT: movl %edi, (%rsi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 1) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -237,10 +237,10 @@ ; ; KNL-LABEL: saddoinci64: ; KNL: ## BB#0: -; KNL-NEXT: incq %rdi -; KNL-NEXT: seto %al -; KNL-NEXT: movq %rdi, (%rsi) -; KNL-NEXT: retq +; KNL-NEXT: incq %rdi ## [1:0.25] +; KNL-NEXT: seto %al ## [1:0.50] +; KNL-NEXT: movq %rdi, (%rsi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 1) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -270,11 +270,11 @@ ; ; KNL-LABEL: saddoi64imm1: ; KNL: ## BB#0: -; KNL-NEXT: movl $2, %ecx -; KNL-NEXT: addq %rdi, %rcx -; KNL-NEXT: seto %al -; KNL-NEXT: movq %rcx, (%rsi) -; KNL-NEXT: retq +; KNL-NEXT: movl $2, %ecx ## [1:0.25] +; KNL-NEXT: addq %rdi, %rcx ## [1:0.25] +; KNL-NEXT: seto %al ## [1:0.50] +; KNL-NEXT: movq %rcx, (%rsi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, 
i1} @llvm.sadd.with.overflow.i64(i64 2, i64 %v1) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -303,9 +303,10 @@ ; KNL-LABEL: saddoi64imm2: ; KNL: ## BB#0: ; KNL-NEXT: addq $-2147483648, %rdi ## imm = 0x80000000 -; KNL-NEXT: seto %al -; KNL-NEXT: movq %rdi, (%rsi) -; KNL-NEXT: retq +; KNL-NEXT: ## [1:0.25] +; KNL-NEXT: seto %al ## [1:0.50] +; KNL-NEXT: movq %rdi, (%rsi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -2147483648) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -335,10 +336,11 @@ ; KNL-LABEL: saddoi64imm3: ; KNL: ## BB#0: ; KNL-NEXT: movabsq $-21474836489, %rcx ## imm = 0xFFFFFFFAFFFFFFF7 -; KNL-NEXT: addq %rdi, %rcx -; KNL-NEXT: seto %al -; KNL-NEXT: movq %rcx, (%rsi) -; KNL-NEXT: retq +; KNL-NEXT: ## [1:0.25] +; KNL-NEXT: addq %rdi, %rcx ## [1:0.25] +; KNL-NEXT: seto %al ## [1:0.50] +; KNL-NEXT: movq %rcx, (%rsi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -21474836489) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -366,9 +368,10 @@ ; KNL-LABEL: saddoi64imm4: ; KNL: ## BB#0: ; KNL-NEXT: addq $2147483647, %rdi ## imm = 0x7FFFFFFF -; KNL-NEXT: seto %al -; KNL-NEXT: movq %rdi, (%rsi) -; KNL-NEXT: retq +; KNL-NEXT: ## [1:0.25] +; KNL-NEXT: seto %al ## [1:0.50] +; KNL-NEXT: movq %rdi, (%rsi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483647) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -398,10 +401,11 @@ ; KNL-LABEL: saddoi64imm5: ; KNL: ## BB#0: ; KNL-NEXT: movl $2147483648, %ecx ## imm = 0x80000000 -; KNL-NEXT: addq %rdi, %rcx -; KNL-NEXT: seto %al -; KNL-NEXT: movq %rcx, (%rsi) -; KNL-NEXT: retq +; KNL-NEXT: ## [1:0.25] +; KNL-NEXT: addq %rdi, %rcx ## [1:0.25] +; KNL-NEXT: seto %al ## [1:0.50] +; KNL-NEXT: movq %rcx, (%rsi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483648) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -429,10 +433,10 @@ ; ; KNL-LABEL: uaddoi32: ; KNL: ## BB#0: -; KNL-NEXT: addl %esi, %edi -; KNL-NEXT: setb %al -; KNL-NEXT: movl %edi, (%rdx) -; KNL-NEXT: retq +; KNL-NEXT: addl %esi, %edi ## [1:0.25] +; KNL-NEXT: setb %al ## [1:0.50] +; KNL-NEXT: movl %edi, (%rdx) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -459,10 +463,10 @@ ; ; KNL-LABEL: uaddoi64: ; KNL: ## BB#0: -; KNL-NEXT: addq %rsi, %rdi -; KNL-NEXT: setb %al -; KNL-NEXT: movq %rdi, (%rdx) -; KNL-NEXT: retq +; KNL-NEXT: addq %rsi, %rdi ## [1:0.25] +; KNL-NEXT: setb %al ## [1:0.50] +; KNL-NEXT: movq %rdi, (%rdx) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -490,10 +494,10 @@ ; ; KNL-LABEL: uaddoinci8: ; KNL: ## BB#0: -; KNL-NEXT: addb $1, %dil -; KNL-NEXT: setb %al -; KNL-NEXT: movb %dil, (%rsi) -; KNL-NEXT: retq +; KNL-NEXT: addb $1, %dil ## [1:0.25] +; KNL-NEXT: setb %al ## [1:0.50] +; KNL-NEXT: movb %dil, (%rsi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %v1, i8 1) %val = extractvalue {i8, i1} %t, 0 %obit = extractvalue {i8, i1} %t, 1 @@ -520,10 +524,10 @@ ; ; KNL-LABEL: uaddoinci16: ; 
KNL: ## BB#0: -; KNL-NEXT: addw $1, %di -; KNL-NEXT: setb %al -; KNL-NEXT: movw %di, (%rsi) -; KNL-NEXT: retq +; KNL-NEXT: addw $1, %di ## [1:0.25] +; KNL-NEXT: setb %al ## [1:0.50] +; KNL-NEXT: movw %di, (%rsi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %v1, i16 1) %val = extractvalue {i16, i1} %t, 0 %obit = extractvalue {i16, i1} %t, 1 @@ -550,10 +554,10 @@ ; ; KNL-LABEL: uaddoinci32: ; KNL: ## BB#0: -; KNL-NEXT: addl $1, %edi -; KNL-NEXT: setb %al -; KNL-NEXT: movl %edi, (%rsi) -; KNL-NEXT: retq +; KNL-NEXT: addl $1, %edi ## [1:0.25] +; KNL-NEXT: setb %al ## [1:0.50] +; KNL-NEXT: movl %edi, (%rsi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 1) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -580,10 +584,10 @@ ; ; KNL-LABEL: uaddoinci64: ; KNL: ## BB#0: -; KNL-NEXT: addq $1, %rdi -; KNL-NEXT: setb %al -; KNL-NEXT: movq %rdi, (%rsi) -; KNL-NEXT: retq +; KNL-NEXT: addq $1, %rdi ## [1:0.25] +; KNL-NEXT: setb %al ## [1:0.50] +; KNL-NEXT: movq %rdi, (%rsi) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 1) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -611,10 +615,10 @@ ; ; KNL-LABEL: ssuboi32: ; KNL: ## BB#0: -; KNL-NEXT: subl %esi, %edi -; KNL-NEXT: seto %al -; KNL-NEXT: movl %edi, (%rdx) -; KNL-NEXT: retq +; KNL-NEXT: subl %esi, %edi ## [1:0.25] +; KNL-NEXT: seto %al ## [1:0.50] +; KNL-NEXT: movl %edi, (%rdx) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -641,10 +645,10 @@ ; ; KNL-LABEL: ssuboi64: ; KNL: ## BB#0: -; KNL-NEXT: subq %rsi, %rdi -; KNL-NEXT: seto %al -; KNL-NEXT: movq %rdi, (%rdx) -; KNL-NEXT: retq +; KNL-NEXT: subq %rsi, %rdi ## [1:0.25] +; KNL-NEXT: seto %al ## [1:0.50] +; KNL-NEXT: movq %rdi, (%rdx) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -672,10 +676,10 @@ ; ; KNL-LABEL: usuboi32: ; KNL: ## BB#0: -; KNL-NEXT: subl %esi, %edi -; KNL-NEXT: setb %al -; KNL-NEXT: movl %edi, (%rdx) -; KNL-NEXT: retq +; KNL-NEXT: subl %esi, %edi ## [1:0.25] +; KNL-NEXT: setb %al ## [1:0.50] +; KNL-NEXT: movl %edi, (%rdx) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -702,10 +706,10 @@ ; ; KNL-LABEL: usuboi64: ; KNL: ## BB#0: -; KNL-NEXT: subq %rsi, %rdi -; KNL-NEXT: setb %al -; KNL-NEXT: movq %rdi, (%rdx) -; KNL-NEXT: retq +; KNL-NEXT: subq %rsi, %rdi ## [1:0.25] +; KNL-NEXT: setb %al ## [1:0.50] +; KNL-NEXT: movq %rdi, (%rdx) ## [1:1.00] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -735,11 +739,11 @@ ; ; KNL-LABEL: saddoselecti32: ; KNL: ## BB#0: -; KNL-NEXT: movl %edi, %eax -; KNL-NEXT: addl %esi, %eax -; KNL-NEXT: cmovol %edi, %esi -; KNL-NEXT: movl %esi, %eax -; KNL-NEXT: retq +; KNL-NEXT: movl %edi, %eax ## [1:0.25] +; KNL-NEXT: addl %esi, %eax ## [1:0.25] +; KNL-NEXT: cmovol %edi, %esi ## [2:0.50] +; KNL-NEXT: movl %esi, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 
%v2) %obit = extractvalue {i32, i1} %t, 1 %ret = select i1 %obit, i32 %v1, i32 %v2 @@ -765,11 +769,11 @@ ; ; KNL-LABEL: saddoselecti64: ; KNL: ## BB#0: -; KNL-NEXT: movq %rdi, %rax -; KNL-NEXT: addq %rsi, %rax -; KNL-NEXT: cmovoq %rdi, %rsi -; KNL-NEXT: movq %rsi, %rax -; KNL-NEXT: retq +; KNL-NEXT: movq %rdi, %rax ## [1:0.25] +; KNL-NEXT: addq %rsi, %rax ## [1:0.25] +; KNL-NEXT: cmovoq %rdi, %rsi ## [2:0.50] +; KNL-NEXT: movq %rsi, %rax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 %ret = select i1 %obit, i64 %v1, i64 %v2 @@ -795,11 +799,11 @@ ; ; KNL-LABEL: uaddoselecti32: ; KNL: ## BB#0: -; KNL-NEXT: movl %edi, %eax -; KNL-NEXT: addl %esi, %eax -; KNL-NEXT: cmovbl %edi, %esi -; KNL-NEXT: movl %esi, %eax -; KNL-NEXT: retq +; KNL-NEXT: movl %edi, %eax ## [1:0.25] +; KNL-NEXT: addl %esi, %eax ## [1:0.25] +; KNL-NEXT: cmovbl %edi, %esi ## [2:0.50] +; KNL-NEXT: movl %esi, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 %ret = select i1 %obit, i32 %v1, i32 %v2 @@ -825,11 +829,11 @@ ; ; KNL-LABEL: uaddoselecti64: ; KNL: ## BB#0: -; KNL-NEXT: movq %rdi, %rax -; KNL-NEXT: addq %rsi, %rax -; KNL-NEXT: cmovbq %rdi, %rsi -; KNL-NEXT: movq %rsi, %rax -; KNL-NEXT: retq +; KNL-NEXT: movq %rdi, %rax ## [1:0.25] +; KNL-NEXT: addq %rsi, %rax ## [1:0.25] +; KNL-NEXT: cmovbq %rdi, %rsi ## [2:0.50] +; KNL-NEXT: movq %rsi, %rax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 %ret = select i1 %obit, i64 %v1, i64 %v2 @@ -853,10 +857,10 @@ ; ; KNL-LABEL: ssuboselecti32: ; KNL: ## BB#0: -; KNL-NEXT: cmpl %esi, %edi -; KNL-NEXT: cmovol %edi, %esi -; KNL-NEXT: movl %esi, %eax -; KNL-NEXT: retq +; KNL-NEXT: cmpl %esi, %edi ## [1:0.25] +; KNL-NEXT: cmovol %edi, %esi ## [2:0.50] +; KNL-NEXT: movl %esi, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 %ret = select i1 %obit, i32 %v1, i32 %v2 @@ -880,10 +884,10 @@ ; ; KNL-LABEL: ssuboselecti64: ; KNL: ## BB#0: -; KNL-NEXT: cmpq %rsi, %rdi -; KNL-NEXT: cmovoq %rdi, %rsi -; KNL-NEXT: movq %rsi, %rax -; KNL-NEXT: retq +; KNL-NEXT: cmpq %rsi, %rdi ## [1:0.25] +; KNL-NEXT: cmovoq %rdi, %rsi ## [2:0.50] +; KNL-NEXT: movq %rsi, %rax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 %ret = select i1 %obit, i64 %v1, i64 %v2 @@ -907,10 +911,10 @@ ; ; KNL-LABEL: usuboselecti32: ; KNL: ## BB#0: -; KNL-NEXT: cmpl %esi, %edi -; KNL-NEXT: cmovbl %edi, %esi -; KNL-NEXT: movl %esi, %eax -; KNL-NEXT: retq +; KNL-NEXT: cmpl %esi, %edi ## [1:0.25] +; KNL-NEXT: cmovbl %edi, %esi ## [2:0.50] +; KNL-NEXT: movl %esi, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 %ret = select i1 %obit, i32 %v1, i32 %v2 @@ -934,10 +938,10 @@ ; ; KNL-LABEL: usuboselecti64: ; KNL: ## BB#0: -; KNL-NEXT: cmpq %rsi, %rdi -; KNL-NEXT: cmovbq %rdi, %rsi -; KNL-NEXT: movq %rsi, %rax -; KNL-NEXT: retq +; KNL-NEXT: cmpq %rsi, %rdi ## [1:0.25] +; KNL-NEXT: cmovbq %rdi, %rsi ## [2:0.50] +; KNL-NEXT: movq %rsi, %rax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) %obit = 
extractvalue {i64, i1} %t, 1 %ret = select i1 %obit, i64 %v1, i64 %v2 @@ -976,14 +980,14 @@ ; ; KNL-LABEL: saddobri32: ; KNL: ## BB#0: -; KNL-NEXT: addl %esi, %edi -; KNL-NEXT: jo LBB31_1 +; KNL-NEXT: addl %esi, %edi ## [1:0.25] +; KNL-NEXT: jo LBB31_1 ## [1:0.50] ; KNL-NEXT: ## BB#2: ## %continue -; KNL-NEXT: movb $1, %al -; KNL-NEXT: retq +; KNL-NEXT: movb $1, %al ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] ; KNL-NEXT: LBB31_1: ## %overflow -; KNL-NEXT: xorl %eax, %eax -; KNL-NEXT: retq +; KNL-NEXT: xorl %eax, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -1025,14 +1029,14 @@ ; ; KNL-LABEL: saddobri64: ; KNL: ## BB#0: -; KNL-NEXT: addq %rsi, %rdi -; KNL-NEXT: jo LBB32_1 +; KNL-NEXT: addq %rsi, %rdi ## [1:0.25] +; KNL-NEXT: jo LBB32_1 ## [1:0.50] ; KNL-NEXT: ## BB#2: ## %continue -; KNL-NEXT: movb $1, %al -; KNL-NEXT: retq +; KNL-NEXT: movb $1, %al ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] ; KNL-NEXT: LBB32_1: ## %overflow -; KNL-NEXT: xorl %eax, %eax -; KNL-NEXT: retq +; KNL-NEXT: xorl %eax, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -1074,14 +1078,14 @@ ; ; KNL-LABEL: uaddobri32: ; KNL: ## BB#0: -; KNL-NEXT: addl %esi, %edi -; KNL-NEXT: jb LBB33_1 +; KNL-NEXT: addl %esi, %edi ## [1:0.25] +; KNL-NEXT: jb LBB33_1 ## [1:0.50] ; KNL-NEXT: ## BB#2: ## %continue -; KNL-NEXT: movb $1, %al -; KNL-NEXT: retq +; KNL-NEXT: movb $1, %al ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] ; KNL-NEXT: LBB33_1: ## %overflow -; KNL-NEXT: xorl %eax, %eax -; KNL-NEXT: retq +; KNL-NEXT: xorl %eax, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -1123,14 +1127,14 @@ ; ; KNL-LABEL: uaddobri64: ; KNL: ## BB#0: -; KNL-NEXT: addq %rsi, %rdi -; KNL-NEXT: jb LBB34_1 +; KNL-NEXT: addq %rsi, %rdi ## [1:0.25] +; KNL-NEXT: jb LBB34_1 ## [1:0.50] ; KNL-NEXT: ## BB#2: ## %continue -; KNL-NEXT: movb $1, %al -; KNL-NEXT: retq +; KNL-NEXT: movb $1, %al ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] ; KNL-NEXT: LBB34_1: ## %overflow -; KNL-NEXT: xorl %eax, %eax -; KNL-NEXT: retq +; KNL-NEXT: xorl %eax, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -1172,14 +1176,14 @@ ; ; KNL-LABEL: ssubobri32: ; KNL: ## BB#0: -; KNL-NEXT: cmpl %esi, %edi -; KNL-NEXT: jo LBB35_1 +; KNL-NEXT: cmpl %esi, %edi ## [1:0.25] +; KNL-NEXT: jo LBB35_1 ## [1:0.50] ; KNL-NEXT: ## BB#2: ## %continue -; KNL-NEXT: movb $1, %al -; KNL-NEXT: retq +; KNL-NEXT: movb $1, %al ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] ; KNL-NEXT: LBB35_1: ## %overflow -; KNL-NEXT: xorl %eax, %eax -; KNL-NEXT: retq +; KNL-NEXT: xorl %eax, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -1221,14 +1225,14 @@ ; ; KNL-LABEL: ssubobri64: ; KNL: ## BB#0: -; KNL-NEXT: cmpq %rsi, %rdi -; KNL-NEXT: jo LBB36_1 +; KNL-NEXT: cmpq %rsi, %rdi ## [1:0.25] +; KNL-NEXT: jo LBB36_1 ## [1:0.50] ; KNL-NEXT: ## BB#2: ## %continue -; KNL-NEXT: movb $1, %al -; KNL-NEXT: retq +; KNL-NEXT: movb $1, %al ## 
[1:0.25] +; KNL-NEXT: retq ## [1:1.00] ; KNL-NEXT: LBB36_1: ## %overflow -; KNL-NEXT: xorl %eax, %eax -; KNL-NEXT: retq +; KNL-NEXT: xorl %eax, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -1270,14 +1274,14 @@ ; ; KNL-LABEL: usubobri32: ; KNL: ## BB#0: -; KNL-NEXT: cmpl %esi, %edi -; KNL-NEXT: jb LBB37_1 +; KNL-NEXT: cmpl %esi, %edi ## [1:0.25] +; KNL-NEXT: jb LBB37_1 ## [1:0.50] ; KNL-NEXT: ## BB#2: ## %continue -; KNL-NEXT: movb $1, %al -; KNL-NEXT: retq +; KNL-NEXT: movb $1, %al ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] ; KNL-NEXT: LBB37_1: ## %overflow -; KNL-NEXT: xorl %eax, %eax -; KNL-NEXT: retq +; KNL-NEXT: xorl %eax, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -1319,14 +1323,14 @@ ; ; KNL-LABEL: usubobri64: ; KNL: ## BB#0: -; KNL-NEXT: cmpq %rsi, %rdi -; KNL-NEXT: jb LBB38_1 +; KNL-NEXT: cmpq %rsi, %rdi ## [1:0.25] +; KNL-NEXT: jb LBB38_1 ## [1:0.50] ; KNL-NEXT: ## BB#2: ## %continue -; KNL-NEXT: movb $1, %al -; KNL-NEXT: retq +; KNL-NEXT: movb $1, %al ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] ; KNL-NEXT: LBB38_1: ## %overflow -; KNL-NEXT: xorl %eax, %eax -; KNL-NEXT: retq +; KNL-NEXT: xorl %eax, %eax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -1358,11 +1362,11 @@ ; ; KNL-LABEL: uaddoovf: ; KNL: ## BB#0: -; KNL-NEXT: movzbl %dil, %ecx -; KNL-NEXT: movzbl %sil, %eax -; KNL-NEXT: addq %rcx, %rax -; KNL-NEXT: xorl %edx, %edx -; KNL-NEXT: retq +; KNL-NEXT: movzbl %dil, %ecx ## [1:0.25] +; KNL-NEXT: movzbl %sil, %eax ## [1:0.25] +; KNL-NEXT: addq %rcx, %rax ## [1:0.25] +; KNL-NEXT: xorl %edx, %edx ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %1 = and i64 %a, 255 %2 = and i64 %b, 255 %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %1, i64 %2) @@ -1386,10 +1390,10 @@ ; ; KNL-LABEL: usuboovf: ; KNL: ## BB#0: -; KNL-NEXT: notq %rsi -; KNL-NEXT: xorl %edx, %edx -; KNL-NEXT: movq %rsi, %rax -; KNL-NEXT: retq +; KNL-NEXT: notq %rsi ## [1:0.25] +; KNL-NEXT: xorl %edx, %edx ## [1:0.25] +; KNL-NEXT: movq %rsi, %rax ## [1:0.25] +; KNL-NEXT: retq ## [1:1.00] %t0 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %a) %v0 = extractvalue {i64, i1} %t0, 0 %o0 = extractvalue {i64, i1} %t0, 1
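For readers skimming the updated CHECK lines: each appended comment of the form [N:M] pairs an instruction's modeled latency in cycles (N) with its reciprocal throughput (M), so "# [1:0.25]" marks a one-cycle instruction that the core can start four times per cycle. The following is a minimal standalone sketch of how such a reciprocal-throughput figure can be derived from per-resource cycle counts and issue width; the type and function names are illustrative only and are not the code added by this patch.

// Standalone illustration (not LLVM code): deriving the "0.25" in "# [1:0.25]"
// from a simple resource model. All names here are made up for the example.
#include <algorithm>
#include <cstdio>
#include <vector>

struct ResourceUse {
  unsigned NumUnits; // how many copies of this resource the CPU has
  unsigned Cycles;   // how many cycles of it the instruction occupies
};

// Reciprocal throughput: average number of cycles between issues of
// independent copies of the instruction. The most contended resource
// dominates; with no modeled resources, issue width is the only limit.
double reciprocalThroughput(const std::vector<ResourceUse> &Uses,
                            unsigned IssueWidth) {
  double RThroughput = 0.0;
  for (const ResourceUse &U : Uses)
    RThroughput = std::max(RThroughput,
                           static_cast<double>(U.Cycles) / U.NumUnits);
  if (RThroughput == 0.0 && IssueWidth != 0)
    RThroughput = 1.0 / IssueWidth; // purely issue-limited instruction
  return RThroughput;
}

int main() {
  // A simple ALU op usable on any of 4 ports for 1 cycle on a 4-wide core
  // prints as [1:0.25]; an op tied to one port for 2 cycles prints as [2:2.00].
  std::printf("[1:%.2f]\n", reciprocalThroughput({{4, 1}}, 4));
  std::printf("[2:%.2f]\n", reciprocalThroughput({{1, 2}}, 4));
  return 0;
}

Under a model like this, the [2:2.00] on the vpblendvb lines above would correspond to an operation occupying a single unit of one port for two cycles, while the [1:0.25] on simple integer ops reflects four equivalent execution ports.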