Index: lib/Target/X86/CMakeLists.txt
===================================================================
--- lib/Target/X86/CMakeLists.txt
+++ lib/Target/X86/CMakeLists.txt
@@ -21,6 +21,7 @@
   X86CallFrameOptimization.cpp
   X86CallLowering.cpp
   X86CmovConversion.cpp
+  X86DomainReassignment.cpp
   X86ExpandPseudo.cpp
   X86FastISel.cpp
   X86FixupBWInsts.cpp
Index: lib/Target/X86/X86.h
===================================================================
--- lib/Target/X86/X86.h
+++ lib/Target/X86/X86.h
@@ -92,6 +92,10 @@
 /// the upper portions of registers, and to save code size.
 FunctionPass *createX86FixupBWInsts();
 
+/// Return a Machine IR pass that reassigns instruction chains from one domain
+/// to another, when profitable.
+FunctionPass *createX86DomainReassignmentPass();
+
 void initializeFixupBWInstPassPass(PassRegistry &);
 
 /// This pass replaces EVEX ecnoded of AVX-512 instructiosn by VEX
Index: lib/Target/X86/X86DomainReassignment.cpp
===================================================================
--- /dev/null
+++ lib/Target/X86/X86DomainReassignment.cpp
@@ -0,0 +1,692 @@
+//===--- X86DomainReassignment.cpp - Selectively switch register classes---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass attempts to find instruction chains (closures) in one domain,
+// and convert them to equivalent instructions in a different domain,
+// if profitable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-domain-reassignment"
+
+STATISTIC(NumClosuresConverted, "Number of closures converted by the pass");
+
+static cl::opt<bool> DisableX86DomainReassignment(
+    "disable-x86-domain-reassignment", cl::Hidden,
+    cl::desc("X86: Disable Virtual Register Reassignment."), cl::init(false));
+
+namespace {
+enum RegDomain { NoDomain = -1, GPRDomain, MaskDomain, OtherDomain };
+
+static bool isGPR(const TargetRegisterClass *RC) {
+  return RC == &X86::GR8RegClass || RC == &X86::GR16RegClass ||
+         RC == &X86::GR32RegClass || RC == &X86::GR64RegClass;
+}
+
+static bool isMask(const TargetRegisterClass *RC) {
+  return RC == &X86::VK1RegClass || RC == &X86::VK2RegClass ||
+         RC == &X86::VK4RegClass || RC == &X86::VK8RegClass ||
+         RC == &X86::VK16RegClass || RC == &X86::VK32RegClass ||
+         RC == &X86::VK1WMRegClass || RC == &X86::VK2WMRegClass ||
+         RC == &X86::VK4WMRegClass || RC == &X86::VK8WMRegClass ||
+         RC == &X86::VK16WMRegClass || RC == &X86::VK32WMRegClass ||
+         RC == &X86::VK64RegClass || RC == &X86::VK64WMRegClass;
+}
+
+static RegDomain getDomain(const TargetRegisterClass *RC) {
+  if (isGPR(RC))
+    return GPRDomain;
+  if (isMask(RC))
+    return MaskDomain;
+  return OtherDomain;
+}
+
+/// Return a register class equivalent to \p SrcRC, in \p Domain.
+static const TargetRegisterClass *getDstRC(const TargetRegisterClass *SrcRC,
+                                           RegDomain Domain) {
+  assert(Domain == MaskDomain && "add domain");
+  if (SrcRC == &X86::GR8RegClass)
+    return &X86::VK8RegClass;
+  if (SrcRC == &X86::GR16RegClass)
+    return &X86::VK16RegClass;
+  if (SrcRC == &X86::GR32RegClass)
+    return &X86::VK32RegClass;
+  if (SrcRC == &X86::GR64RegClass)
+    return &X86::VK64RegClass;
+  assert(false && "add register class");
+  return nullptr;
+}
+
+/// Abstract Instruction Converter class.
+class InstrConverter {
+public:
+  virtual ~InstrConverter() {}
+
+  /// \returns true if \p MI is legal to convert.
+  virtual bool isLegal(const MachineInstr *MI) const = 0;
+
+  /// Applies conversion to \p MI.
+  ///
+  /// \returns true if \p MI is no longer needed, and can be deleted.
+  virtual bool ConvertInstr(MachineInstr *MI) const = 0;
+
+  /// \returns the cost increment incurred by converting \p MI.
+  virtual double getCostIncr(const MachineInstr *MI) const = 0;
+};
+
+/// An Instruction Converter which ignores the given instruction.
+class InstrIgnore : public InstrConverter {
+public:
+  virtual bool isLegal(const MachineInstr *MI) const override { return true; }
+
+  virtual bool ConvertInstr(MachineInstr *MI) const override { return false; }
+
+  virtual double getCostIncr(const MachineInstr *MI) const override {
+    return 0;
+  }
+};
+
+/// An Instruction Converter which replaces an instruction with another.
+class InstrReplacer : public InstrConverter {
+public:
+  /// Opcode of the destination instruction.
+  unsigned DstOpcode;
+
+  const TargetInstrInfo *TII;
+
+  InstrReplacer(unsigned DstOpcode, const TargetInstrInfo *TII)
+      : DstOpcode(DstOpcode), TII(TII) {}
+
+  virtual bool isLegal(const MachineInstr *MI) const override {
+    // It's illegal to replace an instruction that implicitly defines a register
+    // with an instruction that doesn't, unless it's dead.
+    for (auto &MO : MI->implicit_operands())
+      if (MO.isReg() && MO.isDef() && !MO.isDead() &&
+          !TII->get(DstOpcode).hasImplicitDefOfPhysReg(MO.getReg()))
+        return false;
+    return true;
+  }
+
+  virtual bool ConvertInstr(MachineInstr *MI) const override {
+    MachineInstrBuilder Bld =
+        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(DstOpcode));
+    for (auto &Op : MI->explicit_operands())
+      Bld.add(Op);
+    return true;
+  }
+
+  virtual double getCostIncr(const MachineInstr *MI) const override {
+    // Assuming instructions have the same cost.
+    return 0;
+  }
+};
+
+/// An Instruction Converter which replaces an instruction with another, and
+/// adds a COPY from the new instruction's destination to the old one's.
+class InstrReplacerDstCOPY : public InstrConverter { +public: + unsigned DstOpcode; + + const TargetInstrInfo *TII; + MachineRegisterInfo *MRI; + + InstrReplacerDstCOPY(unsigned DstOpcode, const TargetInstrInfo *TII, + MachineRegisterInfo *MRI) + : DstOpcode(DstOpcode), TII(TII), MRI(MRI) {} + + virtual bool isLegal(const MachineInstr *MI) const override { return true; } + + virtual bool ConvertInstr(MachineInstr *MI) const override { + MachineBasicBlock *MBB = MI->getParent(); + auto &DL = MI->getDebugLoc(); + + unsigned Reg = MRI->createVirtualRegister( + TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(), + *MBB->getParent())); + MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg); + for (unsigned Idx = 1, End = MI->getNumOperands(); Idx < End; ++Idx) + Bld.add(MI->getOperand(Idx)); + + BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY)) + .add(MI->getOperand(0)) + .addReg(Reg); + + return true; + } + + virtual double getCostIncr(const MachineInstr *MI) const override { + // Assuming instructions have the same cost, and that COPY is in the same + // domain so it will be eliminated. + return 0; + } +}; + +/// An Instruction Converter for replacing COPY instructions. +class InstrCOPYReplacer : public InstrReplacer { +public: + RegDomain DstDomain; + MachineRegisterInfo *MRI; + + InstrCOPYReplacer(RegDomain DstDomain, unsigned DstOpcode, + const TargetInstrInfo *TII, MachineRegisterInfo *MRI) + : InstrReplacer(DstOpcode, TII), DstDomain(DstDomain), MRI(MRI) {} + + virtual bool isLegal(const MachineInstr *MI) const override { return true; } + + virtual double getCostIncr(const MachineInstr *MI) const override { + assert(MI->getOpcode() == TargetOpcode::COPY && "Expected a COPY"); + + for (auto &MO : MI->operands()) { + if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + return 1; + + RegDomain OpDomain = getDomain(MRI->getRegClass(MO.getReg())); + // Converting a cross domain COPY to a same domain COPY should eliminate + // an insturction + if (OpDomain == DstDomain) + return -1; + } + return 0; + } +}; + +/// An Instruction Converter which replaces an instruction with a COPY. +class InstrReplaceWithCopy : public InstrConverter { +public: + // Source instruction operand Index, to be used as the COPY source. + unsigned SrcOpIdx; + + const TargetInstrInfo *TII; + + InstrReplaceWithCopy(unsigned SrcOpIdx, const TargetInstrInfo *TII) + : SrcOpIdx(SrcOpIdx), TII(TII) {} + + virtual bool isLegal(const MachineInstr *) const override { return true; } + + virtual bool ConvertInstr(MachineInstr *MI) const override { + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(TargetOpcode::COPY)) + .add({MI->getOperand(0), MI->getOperand(SrcOpIdx)}); + return true; + } + + virtual double getCostIncr(const MachineInstr *MI) const override { + return 0; + } +}; + +/// An Instruction Converter which completely deletes an instruction. +class InstrDeleter : public InstrConverter { +public: + virtual bool isLegal(const MachineInstr *MI) const override { return true; } + + virtual bool ConvertInstr(MachineInstr *MI) const override { return true; } + + virtual double getCostIncr(const MachineInstr *MI) const override { + return 0; + } +}; + +typedef std::pair InstrConverterKeyTy; + +typedef DenseMap InstrConverterMap; + +/// A closure is a set of virtual register representing all of the edges in +/// the closure, as well as all of the instructions connected by those edges. 
+/// +/// A closure may encompass virtual registers in the same register bank that +/// have different widths. For example, it may contain 32-bit GPRs as well as +/// 64-bit GPRs. +/// +/// A closure that computes an address (i.e. defines a virtual register that is +/// used in a memory operand) excludes the instructions that contain memory +/// operands using the address. Such an instruction will be included in a +/// different closure that manipulates the loaded or stored value. +class Closure { +private: + const TargetInstrInfo *TII; + MachineRegisterInfo *MRI; + + /// Virtual registers in the closure. + DenseSet Edges; + + /// Instructions in the closure. + SmallVector Instrs; + + /// Virtual registers, candidates to add to the closure. + SmallVector Worklist; + + /// A map of available Instruction Converters. + const InstrConverterMap &Converters; + + /// The register domain of this closure. + RegDomain Domain; + + /// Domains which this closure can legally be reassigned to. + SmallVector LegalDstDomains; + + /// Enqueue \p Reg to be considered for addition to the closure. + void addToWorklist(unsigned Reg); + + /// Add \p MI to this closure. + void encloseInstr(MachineInstr *MI); + +public: + /// All edges that are included in some closure. + static DenseSet EnclosedEdges; + + /// All instructions that are included in some closure. + static DenseMap EnclosedInstrs; + + Closure(const TargetInstrInfo *TII, MachineRegisterInfo *MRI, + const InstrConverterMap &Converters, + const SmallVector LegalDstDomains); + + /// Starting from existing edges in this closure, expand the closure as much + /// as possible. + void buildClosure(unsigned E); + + /// Calculate the total cost of reassigning the closure to \p Domain. + double calculateCost(RegDomain Domain) const; + + /// Reassign the closure to \p Domain. + void Reassign(RegDomain Domain) const; + + /// Mark this closure as illegal for reassignment fo all domains. + void setAllIllegal() { LegalDstDomains.clear(); } + + /// Mark this closure as illegal for reassignment for domain \p RD + void setIllegal(RegDomain RD) { + auto I = std::find(LegalDstDomains.begin(), LegalDstDomains.end(), RD); + if (I != LegalDstDomains.end()) + LegalDstDomains.erase(I); + } + + /// \returns true if this closure has domains which are legal to reassign to. + bool hasLegalDstDomain() const { return !LegalDstDomains.empty(); } + + /// \returns true if is legal to reassign this closure to domain \p RD. + bool isLegal(RegDomain RD) const { + return std::find(LegalDstDomains.begin(), LegalDstDomains.end(), RD) != + LegalDstDomains.end(); + } + + bool empty() const { return Edges.empty(); } +}; + +DenseSet Closure::EnclosedEdges; +DenseMap Closure::EnclosedInstrs; + +class X86DomainReassignment : public MachineFunctionPass { +public: + static char ID; + + X86DomainReassignment() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { + return "X86 Domain Reassignment Pass"; + } + +private: + const X86Subtarget *STI; + MachineRegisterInfo *MRI; + const X86InstrInfo *TII; + + /// A map of available Instruction Converters. + InstrConverterMap Converters; + + /// Initialize Converters map. + void initConverters(); +}; + +char X86DomainReassignment::ID = 0; +} // End anonymous namespace. 
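+
+// Illustrative sketch (schematic MIR; register numbers and operands are
+// arbitrary): given a GPR-domain closure such as
+//
+//   %0:gr16 = MOV16rm <addr>       ; mask value loaded into a GPR
+//   %1:gr16 = AND16rr %0, %2       ; combined with another closure member
+//   MOV16mr <addr>, %1             ; and stored back
+//
+// the pass can reassign every edge to the mask domain and rewrite it, using
+// the converters registered in initConverters() below, as
+//
+//   %0:vk16 = KMOVWkm <addr>
+//   %1:vk16 = KANDWrr %0, %2
+//   KMOVWmk <addr>, %1
+//
+// assuming AND16rr's implicit EFLAGS def is dead. The change is considered
+// profitable when the closure's total cost decreases, e.g. when cross-domain
+// COPYs collapse into same-domain copies (see InstrCOPYReplacer::getCostIncr).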
+ +Closure::Closure(const TargetInstrInfo *TII, MachineRegisterInfo *MRI, + const InstrConverterMap &Converters, + const SmallVector LegalDstDomains) + : TII(TII), MRI(MRI), Converters(Converters), Domain(NoDomain), + LegalDstDomains(LegalDstDomains) {} + +void Closure::addToWorklist(unsigned Reg) { + if (EnclosedEdges.count(Reg)) + return; + + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return; + + if (!MRI->hasOneDef(Reg)) + return; + + RegDomain RD = getDomain(MRI->getRegClass(Reg)); + // First edge in closure sets the domain. + if (Domain == NoDomain) + Domain = RD; + + if (Domain != RD) + return; + + Worklist.push_back(Reg); +} + +void Closure::encloseInstr(MachineInstr *MI) { + auto I = EnclosedInstrs.find(MI); + if (EnclosedInstrs.find(MI) != EnclosedInstrs.end()) { + if (I->second != this) + setAllIllegal(); + return; + } + + EnclosedInstrs[MI] = this; + Instrs.push_back(MI); + + for (RegDomain D : LegalDstDomains) { + InstrConverter *IC = Converters.lookup({D, MI->getOpcode()}); + if (!IC || !IC->isLegal(MI)) + setIllegal(D); + } +} + +double Closure::calculateCost(RegDomain DstDomain) const { + assert(isLegal(DstDomain) && "Cannot calculate cost for illegal closure"); + + double Cost = 0.0; + for (auto MI : Instrs) + Cost += Converters.lookup({DstDomain, MI->getOpcode()})->getCostIncr(MI); + return Cost; +} + +void Closure::Reassign(RegDomain Domain) const { + assert(isLegal(Domain) && "Cannot convert illegal closure"); + + SmallVector ToErase; + for (auto MI : Instrs) + if (Converters.lookup({Domain, MI->getOpcode()})->ConvertInstr(MI)) + ToErase.push_back(MI); + + for (unsigned E : Edges) { + unsigned DestReg = + MRI->createVirtualRegister(getDstRC(MRI->getRegClass(E), Domain)); + MRI->replaceRegWith(E, DestReg); + for (auto &MO : MRI->use_operands(DestReg)) { + if (MO.isReg()) + MO.setSubReg(0); + } + } + + for (auto MI : ToErase) + MI->eraseFromParent(); +} + +/// \returns true when \p Reg is used as part of an address calculation in \p +/// MI. +static bool usedAsAddr(const MachineInstr &MI, unsigned Reg, + const TargetInstrInfo *TII) { + if (!MI.mayLoadOrStore()) + return false; + + const MCInstrDesc &Desc = TII->get(MI.getOpcode()); + int MemOpStart = X86II::getMemoryOperandNo(Desc.TSFlags); + if (MemOpStart == -1) + return false; + + MemOpStart += X86II::getOperandBias(Desc); + for (unsigned MemOpIdx = MemOpStart, + MemOpEnd = MemOpStart + X86::AddrNumOperands; + MemOpIdx < MemOpEnd; ++MemOpIdx) { + auto &Op = MI.getOperand(MemOpIdx); + if (Op.isReg() && Op.getReg() == Reg) + return true; + } + return false; +} + +void Closure::buildClosure(unsigned E) { + addToWorklist(E); + while (!Worklist.empty()) { + unsigned E = Worklist.pop_back_val(); + + // Register already in this closure. + if (!Edges.insert(E).second) + continue; + + MachineInstr *DefMI = MRI->getVRegDef(E); + encloseInstr(DefMI); + + // Add register used by the defining MI to the worklist. + // Do not add registers which are used in address calculation, they will be + // added to a different closure. + int OpEnd = DefMI->getNumOperands(); + const MCInstrDesc &Desc = DefMI->getDesc(); + int MemOp = X86II::getMemoryOperandNo(Desc.TSFlags); + if (MemOp != -1) + MemOp += X86II::getOperandBias(Desc); + for (int OpIdx = 0; OpIdx < OpEnd; ++OpIdx) { + if (OpIdx == MemOp) { + // skip address calculation. 
+ OpIdx += (X86::AddrNumOperands - 1); + continue; + } + auto &Op = DefMI->getOperand(OpIdx); + if (!Op.isReg() || !Op.isUse()) + continue; + addToWorklist(Op.getReg()); + } + + // Expand closure through register uses. + for (auto &UseMI : MRI->use_instructions(E)) { + // We would like to avoid converting closures which calculare addresses, + // as this should remain in GPRs. + if (usedAsAddr(UseMI, E, TII)) { + setAllIllegal(); + continue; + } + encloseInstr(&UseMI); + + for (auto &DefOp : UseMI.defs()) { + if (!DefOp.isReg()) + continue; + + unsigned DefReg = DefOp.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(DefReg)) { + setAllIllegal(); + continue; + } + addToWorklist(DefReg); + } + } + } +} + +void X86DomainReassignment::initConverters() { + Converters[{MaskDomain, TargetOpcode::PHI}] = new InstrIgnore(); + + Converters[{MaskDomain, TargetOpcode::IMPLICIT_DEF}] = new InstrDeleter(); + + Converters[{MaskDomain, TargetOpcode::INSERT_SUBREG}] = + new InstrReplaceWithCopy(2, TII); + + Converters[{MaskDomain, TargetOpcode::COPY}] = + new InstrCOPYReplacer(MaskDomain, TargetOpcode::COPY, TII, MRI); + + auto createReplacerDstCOPY = [&](unsigned From, unsigned To) { + Converters[{MaskDomain, From}] = new InstrReplacerDstCOPY(To, TII, MRI); + }; + + createReplacerDstCOPY(X86::MOVZX32rm16, X86::KMOVWkm); + createReplacerDstCOPY(X86::MOVZX64rm16, X86::KMOVWkm); + + createReplacerDstCOPY(X86::MOVZX32rr16, X86::KMOVWkk); + createReplacerDstCOPY(X86::MOVZX64rr16, X86::KMOVWkk); + + if (STI->hasBWI()) { + createReplacerDstCOPY(X86::MOVZX16rm8, X86::KMOVBkm); + createReplacerDstCOPY(X86::MOVZX32rm8, X86::KMOVBkm); + createReplacerDstCOPY(X86::MOVZX64rm8, X86::KMOVBkm); + + createReplacerDstCOPY(X86::MOVZX16rr8, X86::KMOVBkk); + createReplacerDstCOPY(X86::MOVZX32rr8, X86::KMOVBkk); + createReplacerDstCOPY(X86::MOVZX64rr8, X86::KMOVBkk); + } + + auto createReplacer = [&](unsigned From, unsigned To) { + Converters[{MaskDomain, From}] = new InstrReplacer(To, TII); + }; + + createReplacer(X86::MOV16rm, X86::KMOVWkm); + createReplacer(X86::MOV16mr, X86::KMOVWmk); + createReplacer(X86::MOV16rr, X86::KMOVWkk); + createReplacer(X86::SHR16ri, X86::KSHIFTRWri); + createReplacer(X86::SHL16ri, X86::KSHIFTLWri); + createReplacer(X86::ADD16rr, X86::KADDWrr); + createReplacer(X86::NOT16r, X86::KNOTWrr); + createReplacer(X86::OR16rr, X86::KORWrr); + createReplacer(X86::AND16rr, X86::KANDWrr); + createReplacer(X86::XOR16rr, X86::KXORWrr); + createReplacer(X86::TEST16rr, X86::KTESTWrr); + + if (STI->hasBWI()) { + createReplacer(X86::MOV8rm, X86::KMOVBkm); + createReplacer(X86::MOV32rm, X86::KMOVDkm); + createReplacer(X86::MOV64rm, X86::KMOVQkm); + + createReplacer(X86::MOV8mr, X86::KMOVBmk); + createReplacer(X86::MOV32mr, X86::KMOVDmk); + createReplacer(X86::MOV64mr, X86::KMOVQmk); + + createReplacer(X86::MOV8rr, X86::KMOVBkk); + createReplacer(X86::MOV32rr, X86::KMOVDkk); + createReplacer(X86::MOV64rr, X86::KMOVQkk); + + createReplacer(X86::SHR8ri, X86::KSHIFTRBri); + createReplacer(X86::SHR32ri, X86::KSHIFTRDri); + createReplacer(X86::SHR64ri, X86::KSHIFTRQri); + + createReplacer(X86::SHL8ri, X86::KSHIFTLBri); + createReplacer(X86::SHL32ri, X86::KSHIFTLDri); + createReplacer(X86::SHL64ri, X86::KSHIFTLQri); + + createReplacer(X86::ADD8rr, X86::KADDBrr); + createReplacer(X86::ADD32rr, X86::KADDDrr); + createReplacer(X86::ADD64rr, X86::KADDQrr); + + createReplacer(X86::NOT8r, X86::KNOTBrr); + createReplacer(X86::NOT32r, X86::KNOTDrr); + createReplacer(X86::NOT64r, X86::KNOTQrr); + + createReplacer(X86::OR8rr, 
X86::KORBrr); + createReplacer(X86::OR32rr, X86::KORDrr); + createReplacer(X86::OR64rr, X86::KORQrr); + + createReplacer(X86::AND8rr, X86::KANDBrr); + createReplacer(X86::AND32rr, X86::KANDDrr); + createReplacer(X86::AND64rr, X86::KANDQrr); + + createReplacer(X86::ANDN32rr, X86::KANDNDrr); + createReplacer(X86::ANDN64rr, X86::KANDNQrr); + + createReplacer(X86::XOR8rr, X86::KXORBrr); + createReplacer(X86::XOR32rr, X86::KXORDrr); + createReplacer(X86::XOR64rr, X86::KXORQrr); + + createReplacer(X86::TEST8rr, X86::KTESTBrr); + createReplacer(X86::TEST32rr, X86::KTESTDrr); + createReplacer(X86::TEST64rr, X86::KTESTQrr); + } +} + +bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + if (DisableX86DomainReassignment) + return false; + + DEBUG(dbgs() << "***** Machine Function before Domain Reassignment *****\n"); + DEBUG(MF.print(dbgs())); + + STI = &MF.getSubtarget(); + // GPR->K is the only transformation currently supported, bail out early if no + // AVX512. + if (!STI->hasAVX512()) + return false; + + MRI = &MF.getRegInfo(); + TII = STI->getInstrInfo(); + initConverters(); + bool Changed = false; + + Closure::EnclosedEdges.clear(); + Closure::EnclosedInstrs.clear(); + + std::vector Closures; + + // Go over all virtual registers and calculate a closure. + for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(Idx); + + // GPR only current source domain supported. + if (!isGPR(MRI->getRegClass(Reg))) + continue; + + // Register already in closure. + if (Closure::EnclosedEdges.count(Reg)) + continue; + + // Calculate closure starting with Reg. + Closure C(TII, MRI, Converters, {MaskDomain}); + C.buildClosure(Reg); + + // Collect all closures that can potentially be converted. + if (!C.empty() && C.isLegal(MaskDomain)) + Closures.push_back(std::move(C)); + } + + for (Closure &C : Closures) + if (C.calculateCost(MaskDomain) < 0) { + C.Reassign(MaskDomain); + ++NumClosuresConverted; + Changed = true; + } + + for (auto I : Converters) + delete I.second; + + DEBUG(dbgs() << "***** Machine Function after Domain Reassignment *****\n"); + DEBUG(MF.print(dbgs())); + + return Changed; +} + +/// Returns an instance of the Domain Reassignment pass. 
+FunctionPass *llvm::createX86DomainReassignmentPass() { + return new X86DomainReassignment(); +} Index: lib/Target/X86/X86TargetMachine.cpp =================================================================== --- lib/Target/X86/X86TargetMachine.cpp +++ lib/Target/X86/X86TargetMachine.cpp @@ -312,6 +312,7 @@ bool addGlobalInstructionSelect() override; bool addILPOpts() override; bool addPreISel() override; + void addMachineSSAOptimization() override; void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreEmitPass() override; @@ -405,6 +406,10 @@ addPass(createX86WinAllocaExpander()); } +void X86PassConfig::addMachineSSAOptimization() { + addPass(createX86DomainReassignmentPass()); + TargetPassConfig::addMachineSSAOptimization(); +} void X86PassConfig::addPostRegAlloc() { addPass(createX86FloatingPointStackifierPass()); Index: test/CodeGen/X86/avx512-insert-extract.ll =================================================================== --- test/CodeGen/X86/avx512-insert-extract.ll +++ test/CodeGen/X86/avx512-insert-extract.ll @@ -391,11 +391,10 @@ ; ; SKX-LABEL: test16: ; SKX: ## BB#0: -; SKX-NEXT: movb (%rdi), %al -; SKX-NEXT: kmovd %esi, %k0 -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: vpmovm2d %k1, %zmm0 -; SKX-NEXT: vpmovm2d %k0, %zmm1 +; SKX-NEXT: kmovb (%rdi), %k0 +; SKX-NEXT: kmovd %esi, %k1 +; SKX-NEXT: vpmovm2d %k0, %zmm0 +; SKX-NEXT: vpmovm2d %k1, %zmm1 ; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15] ; SKX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; SKX-NEXT: vpmovd2m %zmm2, %k0 @@ -428,11 +427,10 @@ ; ; SKX-LABEL: test17: ; SKX: ## BB#0: -; SKX-NEXT: movb (%rdi), %al -; SKX-NEXT: kmovd %esi, %k0 -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: vpmovm2q %k1, %zmm0 -; SKX-NEXT: vpmovm2q %k0, %zmm1 +; SKX-NEXT: kmovb (%rdi), %k0 +; SKX-NEXT: kmovd %esi, %k1 +; SKX-NEXT: vpmovm2q %k0, %zmm0 +; SKX-NEXT: vpmovm2q %k1, %zmm1 ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7] ; SKX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; SKX-NEXT: vpmovq2m %zmm2, %k0 Index: test/CodeGen/X86/avx512-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -23,8 +23,7 @@ define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) { ; X32-LABEL: test_mm512_mask_broadcastd_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1} ; X32-NEXT: retl ; @@ -45,8 +44,7 @@ define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm512_maskz_broadcastd_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ; X32-NEXT: retl ; @@ -184,8 +182,7 @@ define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) { ; X32-LABEL: test_mm512_mask_broadcastss_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vbroadcastss %xmm1, %zmm0 {%k1} ; X32-NEXT: retl ; @@ -203,8 +200,7 @@ define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) { ; X32-LABEL: test_mm512_maskz_broadcastss_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; 
X32-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ; X32-NEXT: retl ; @@ -288,8 +284,7 @@ define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) { ; X32-LABEL: test_mm512_mask_movehdup_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; X32-NEXT: retl ; @@ -307,8 +302,7 @@ define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) { ; X32-LABEL: test_mm512_maskz_movehdup_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; X32-NEXT: retl ; @@ -340,8 +334,7 @@ define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) { ; X32-LABEL: test_mm512_mask_moveldup_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; X32-NEXT: retl ; @@ -359,8 +352,7 @@ define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) { ; X32-LABEL: test_mm512_maskz_moveldup_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; X32-NEXT: retl ; @@ -444,8 +436,7 @@ define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) { ; X32-LABEL: test_mm512_mask_permute_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12] ; X32-NEXT: retl ; @@ -463,8 +454,7 @@ define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) { ; X32-LABEL: test_mm512_maskz_permute_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12] ; X32-NEXT: retl ; @@ -602,8 +592,7 @@ define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) { ; X32-LABEL: test_mm512_mask_shuffle_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] ; X32-NEXT: retl ; @@ -624,8 +613,7 @@ define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) { ; X32-LABEL: test_mm512_maskz_shuffle_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] ; X32-NEXT: retl ; @@ -714,8 +702,7 @@ define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) { ; X32-LABEL: test_mm512_mask_unpackhi_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = 
zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15] ; X32-NEXT: retl ; @@ -737,8 +724,7 @@ define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) { ; X32-LABEL: test_mm512_maskz_unpackhi_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; X32-NEXT: retl ; @@ -877,8 +863,7 @@ define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) { ; X32-LABEL: test_mm512_mask_unpackhi_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15] ; X32-NEXT: retl ; @@ -896,8 +881,7 @@ define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) { ; X32-LABEL: test_mm512_maskz_unpackhi_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; X32-NEXT: retl ; @@ -932,8 +916,7 @@ define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) { ; X32-LABEL: test_mm512_mask_unpacklo_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13] ; X32-NEXT: retl ; @@ -955,8 +938,7 @@ define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) { ; X32-LABEL: test_mm512_maskz_unpacklo_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; X32-NEXT: retl ; @@ -1095,8 +1077,7 @@ define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) { ; X32-LABEL: test_mm512_mask_unpacklo_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13] ; X32-NEXT: retl ; @@ -1114,8 +1095,7 @@ define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) { ; X32-LABEL: test_mm512_maskz_unpacklo_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = 
zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; X32-NEXT: retl ; Index: test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512-mask-op.ll +++ test/CodeGen/X86/avx512-mask-op.ll @@ -187,11 +187,9 @@ ; ; AVX512BW-LABEL: mask8_mem: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: movzbl (%rdi), %eax -; AVX512BW-NEXT: kmovd %eax, %k0 +; AVX512BW-NEXT: kmovb (%rdi), %k0 ; AVX512BW-NEXT: knotw %k0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, (%rdi) +; AVX512BW-NEXT: kmovb %k0, (%rdi) ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: mask8_mem: @@ -485,8 +483,7 @@ ; AVX512BW-LABEL: conv1: ; AVX512BW: ## BB#0: ## %entry ; AVX512BW-NEXT: kxnorw %k0, %k0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, (%rdi) +; AVX512BW-NEXT: kmovb %k0, (%rdi) ; AVX512BW-NEXT: movb $-2, -{{[0-9]+}}(%rsp) ; AVX512BW-NEXT: movb $-2, %al ; AVX512BW-NEXT: retq @@ -630,8 +627,7 @@ ; AVX512BW-NEXT: movb $85, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: testb %al, %al +; AVX512BW-NEXT: ktestb %k0, %k0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1168,50 +1164,42 @@ ; KNL-LABEL: test18: ; KNL: ## BB#0: ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: kmovw %esi, %k0 -; KNL-NEXT: kshiftlw $7, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax -; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kmovw %esi, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: kmovw %ecx, %k1 -; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] ; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 -; KNL-NEXT: kshiftlw $1, %k0, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k0 -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftlw $7, %k1, %k1 -; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: korw %k0, %k1, %k1 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test18: ; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k0 -; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kshiftlw $7, %k1, %k2 +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: kmovd %esi, %k2 +; SKX-NEXT: kshiftlw $7, %k2, %k0 +; SKX-NEXT: kshiftrw $15, %k0, %k0 +; SKX-NEXT: kshiftlw $6, %k2, %k2 ; SKX-NEXT: kshiftrw $15, %k2, %k2 -; SKX-NEXT: kmovd %k2, %eax -; SKX-NEXT: kshiftlw $6, %k1, %k1 -; SKX-NEXT: kshiftrw $15, %k1, %k1 -; SKX-NEXT: kmovd %k1, %ecx -; SKX-NEXT: vpmovm2q %k0, %zmm0 -; SKX-NEXT: kmovd %ecx, %k0 -; SKX-NEXT: vpmovm2q %k0, %zmm1 +; SKX-NEXT: vpmovm2q %k1, %zmm0 +; SKX-NEXT: vpmovm2q %k2, %zmm1 ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] ; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; SKX-NEXT: vpmovq2m %zmm2, %k0 -; SKX-NEXT: kshiftlb $1, %k0, %k0 -; SKX-NEXT: kshiftrb $1, %k0, %k0 -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: kshiftlb $7, %k1, %k1 -; SKX-NEXT: korb %k1, %k0, %k0 +; SKX-NEXT: vpmovq2m %zmm2, %k1 +; SKX-NEXT: 
kshiftlb $1, %k1, %k1 +; SKX-NEXT: kshiftrb $1, %k1, %k1 +; SKX-NEXT: kshiftlb $7, %k0, %k0 +; SKX-NEXT: korb %k0, %k1, %k0 ; SKX-NEXT: vpmovm2w %k0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -1219,25 +1207,21 @@ ; AVX512BW-LABEL: test18: ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: kmovd %esi, %k0 -; AVX512BW-NEXT: kshiftlw $7, %k0, %k2 -; AVX512BW-NEXT: kshiftrw $15, %k2, %k2 -; AVX512BW-NEXT: kmovd %k2, %eax -; AVX512BW-NEXT: kshiftlw $6, %k0, %k0 +; AVX512BW-NEXT: kmovd %esi, %k2 +; AVX512BW-NEXT: kshiftlw $7, %k2, %k0 ; AVX512BW-NEXT: kshiftrw $15, %k0, %k0 -; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $15, %k2, %k2 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpsllq $63, %zmm2, %zmm0 -; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kshiftlw $7, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $7, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: vzeroupper @@ -1245,25 +1229,21 @@ ; ; AVX512DQ-LABEL: test18: ; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: kmovw %edi, %k0 -; AVX512DQ-NEXT: kmovw %esi, %k1 -; AVX512DQ-NEXT: kshiftlw $7, %k1, %k2 +; AVX512DQ-NEXT: kmovw %edi, %k1 +; AVX512DQ-NEXT: kmovw %esi, %k2 +; AVX512DQ-NEXT: kshiftlw $7, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k2, %k2 ; AVX512DQ-NEXT: kshiftrw $15, %k2, %k2 -; AVX512DQ-NEXT: kmovw %k2, %eax -; AVX512DQ-NEXT: kshiftlw $6, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 -; AVX512DQ-NEXT: kmovw %k1, %ecx -; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 -; AVX512DQ-NEXT: kmovw %ecx, %k0 -; AVX512DQ-NEXT: vpmovm2q %k0, %zmm1 +; AVX512DQ-NEXT: vpmovm2q %k1, %zmm0 +; AVX512DQ-NEXT: vpmovm2q %k2, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovq2m %zmm2, %k0 -; AVX512DQ-NEXT: kshiftlb $1, %k0, %k0 -; AVX512DQ-NEXT: kshiftrb $1, %k0, %k0 -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: kshiftlb $7, %k1, %k1 -; AVX512DQ-NEXT: korb %k1, %k0, %k0 +; AVX512DQ-NEXT: vpmovq2m %zmm2, %k1 +; AVX512DQ-NEXT: kshiftlb $1, %k1, %k1 +; AVX512DQ-NEXT: kshiftrb $1, %k1, %k1 +; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0 +; AVX512DQ-NEXT: korb %k0, %k1, %k0 ; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper @@ -1346,8 +1326,7 @@ ; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, (%rdi) +; AVX512BW-NEXT: kmovb %k0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1385,8 +1364,7 @@ ; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, (%rdi) +; 
AVX512BW-NEXT: kmovb %k0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1425,8 +1403,7 @@ ; AVX512BW-NEXT: kmovd %edi, %k0 ; AVX512BW-NEXT: kxnorw %k0, %k0, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, (%rsi) +; AVX512BW-NEXT: kmovb %k0, (%rsi) ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: store_v1i1: @@ -1466,8 +1443,7 @@ ; AVX512BW-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, (%rdi) +; AVX512BW-NEXT: kmovb %k0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1510,8 +1486,7 @@ ; AVX512BW-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, (%rdi) +; AVX512BW-NEXT: kmovb %k0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1553,8 +1528,7 @@ ; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 ; AVX512BW-NEXT: knotw %k0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, (%rdi) +; AVX512BW-NEXT: kmovb %k0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1766,8 +1740,7 @@ ; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1 ; AVX512BW-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} ; AVX512BW-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: testb %al, %al +; AVX512BW-NEXT: ktestb %k0, %k0 ; AVX512BW-NEXT: je LBB41_2 ; AVX512BW-NEXT: ## BB#1: ## %L1 ; AVX512BW-NEXT: vmovapd %zmm0, (%rdi) @@ -2514,8 +2487,7 @@ ; ; AVX512BW-LABEL: load_8i1: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: movzbl (%rdi), %eax -; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: kmovb (%rdi), %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -2575,8 +2547,7 @@ ; ; AVX512BW-LABEL: load_2i1: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: movzbl (%rdi), %eax -; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: kmovb (%rdi), %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: vzeroupper @@ -2612,8 +2583,7 @@ ; ; AVX512BW-LABEL: load_4i1: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: movzbl (%rdi), %eax -; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: kmovb (%rdi), %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %YMM0 @@ -2743,8 +2713,7 @@ ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, (%rdi) +; AVX512BW-NEXT: kmovb %k0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2781,8 +2750,7 @@ ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, (%rdi) +; AVX512BW-NEXT: kmovb %k0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; Index: test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll @@ -84,8 +84,7 @@ define <8 x i64> @test_mm512_mask_broadcastw_epi16(<8 x i64> %a0, i32 %a1, <2 x i64> %a2) { ; X32-LABEL: test_mm512_mask_broadcastw_epi16: ; X32: # BB#0: -; X32-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastw %xmm1, %zmm0 {%k1} ; X32-NEXT: retl ; @@ -106,8 +105,7 @@ define <8 x i64> @test_mm512_maskz_broadcastw_epi16(i32 %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm512_maskz_broadcastw_epi16: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ; X32-NEXT: retl ; @@ -241,8 +239,7 @@ define <8 x i64> @test_mm512_mask_unpackhi_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) { ; X32-LABEL: test_mm512_mask_unpackhi_epi16: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31] ; X32-NEXT: retl ; @@ -264,8 +261,7 @@ define <8 x i64> @test_mm512_maskz_unpackhi_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) { ; X32-LABEL: test_mm512_maskz_unpackhi_epi16: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] ; X32-NEXT: retl ; @@ -367,8 +363,7 @@ define <8 x i64> @test_mm512_mask_unpacklo_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) { ; X32-LABEL: test_mm512_mask_unpacklo_epi16: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27] ; X32-NEXT: retl ; @@ -390,8 +385,7 @@ define <8 x i64> @test_mm512_maskz_unpacklo_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) { ; X32-LABEL: test_mm512_maskz_unpacklo_epi16: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] ; X32-NEXT: retl ; Index: test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll @@ -23,8 +23,7 @@ define <2 x i64> @test_mm_mask_broadcastb_epi8(<2 x i64> %a0, i16 %a1, <2 x i64> %a2) { ; X32-LABEL: test_mm_mask_broadcastb_epi8: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: 
vpbroadcastb %xmm1, %xmm0 {%k1} ; X32-NEXT: retl ; @@ -45,8 +44,7 @@ define <2 x i64> @test_mm_maskz_broadcastb_epi8(i16 %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm_maskz_broadcastb_epi8: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ; X32-NEXT: retl ; @@ -82,8 +80,7 @@ define <4 x i64> @test_mm256_mask_broadcastb_epi8(<4 x i64> %a0, i32 %a1, <2 x i64> %a2) { ; X32-LABEL: test_mm256_mask_broadcastb_epi8: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastb %xmm1, %ymm0 {%k1} ; X32-NEXT: retl ; @@ -104,8 +101,7 @@ define <4 x i64> @test_mm256_maskz_broadcastb_epi8(i32 %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm256_maskz_broadcastb_epi8: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ; X32-NEXT: retl ; @@ -200,8 +196,7 @@ define <4 x i64> @test_mm256_mask_broadcastw_epi16(<4 x i64> %a0, i16 %a1, <2 x i64> %a2) { ; X32-LABEL: test_mm256_mask_broadcastw_epi16: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastw %xmm1, %ymm0 {%k1} ; X32-NEXT: retl ; @@ -222,8 +217,7 @@ define <4 x i64> @test_mm256_maskz_broadcastw_epi16(i16 %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm256_maskz_broadcastw_epi16: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ; X32-NEXT: retl ; Index: test/CodeGen/X86/bitcast-and-setcc-128.ll =================================================================== --- test/CodeGen/X86/bitcast-and-setcc-128.ll +++ test/CodeGen/X86/bitcast-and-setcc-128.ll @@ -74,8 +74,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} -; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al ; AVX512-NEXT: retq %x0 = icmp sgt <4 x i32> %a, %b @@ -108,8 +107,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %k1 ; AVX512-NEXT: vcmpltps %xmm2, %xmm3, %k0 {%k1} -; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al ; AVX512-NEXT: retq %x0 = fcmp ogt <4 x float> %a, %b @@ -279,8 +277,7 @@ ; AVX512-NEXT: vpsraq $56, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} -; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al ; AVX512-NEXT: retq %x0 = icmp sgt <2 x i8> %a, %b @@ -417,8 +414,7 @@ ; AVX512-NEXT: vpsraq $48, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} -; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al ; AVX512-NEXT: retq %x0 = icmp sgt <2 x i16> %a, %b @@ -539,8 +535,7 @@ ; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} -; AVX512-NEXT: kmovd %k0, %eax 
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: retq
 %x0 = icmp sgt <2 x i32> %a, %b
@@ -592,8 +587,7 @@
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
 ; AVX512-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: retq
 %x0 = icmp sgt <2 x i64> %a, %b
@@ -626,8 +620,7 @@
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k1
 ; AVX512-NEXT: vcmpltpd %xmm2, %xmm3, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: retq
 %x0 = fcmp ogt <2 x double> %a, %b
@@ -684,8 +677,7 @@
 ; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
 ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
 ; AVX512-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: retq
 %x0 = icmp sgt <4 x i8> %a, %b
@@ -742,8 +734,7 @@
 ; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0
 ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
 ; AVX512-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: retq
 %x0 = icmp sgt <4 x i16> %a, %b
Index: test/CodeGen/X86/bitcast-and-setcc-256.ll
===================================================================
--- test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -96,8 +96,7 @@
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
 ; AVX512-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
@@ -144,8 +143,7 @@
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k1
 ; AVX512-NEXT: vcmpltpd %ymm2, %ymm3, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
Index: test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
===================================================================
--- test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -44,8 +44,7 @@
 ; AVX512: # BB#0:
 ; AVX512-NEXT: andb $3, %dil
 ; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
 ; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0
 ; AVX512-NEXT: vzeroupper
@@ -107,8 +106,7 @@
 ; AVX512: # BB#0:
 ; AVX512-NEXT: andb $15, %dil
 ; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -493,8 +491,7 @@
 ; AVX512: # BB#0:
 ; AVX512-NEXT: andb $15, %dil
 ; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
 ; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
 ; AVX512-NEXT: retq
Index: test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
===================================================================
--- test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -42,8 +42,7 @@
 ; AVX512: # BB#0:
 ; AVX512-NEXT: andb $3, %dil
 ; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
 ; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0
 ; AVX512-NEXT: vzeroupper
@@ -113,8 +112,7 @@
 ; AVX512: # BB#0:
 ; AVX512-NEXT: andb $15, %dil
 ; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
 ; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: vzeroupper
@@ -596,8 +594,7 @@
 ; AVX512: # BB#0:
 ; AVX512-NEXT: andb $15, %dil
 ; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
 ; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
 ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
 ; AVX512-NEXT: retq
Index: test/CodeGen/X86/bitcast-int-to-vector-bool.ll
===================================================================
--- test/CodeGen/X86/bitcast-int-to-vector-bool.ll
+++ test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -35,8 +35,7 @@
 ; AVX512-LABEL: bitcast_i2_2i1:
 ; AVX512: # BB#0:
 ; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
 ; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0
 ; AVX512-NEXT: vzeroupper
@@ -101,8 +100,7 @@
 ; AVX512-LABEL: bitcast_i4_4i1:
 ; AVX512: # BB#0:
 ; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
Index: test/CodeGen/X86/bitcast-setcc-128.ll
===================================================================
--- test/CodeGen/X86/bitcast-setcc-128.ll
+++ test/CodeGen/X86/bitcast-setcc-128.ll
@@ -60,8 +60,7 @@
 ; AVX512-LABEL: v4i32:
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: retq
 %x = icmp sgt <4 x i32> %a, %b
@@ -87,8 +86,7 @@
 ; AVX512-LABEL: v4f32:
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: retq
 %x = fcmp ogt <4 x float> %a, %b
@@ -195,8 +193,7 @@
 ; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0
 ; AVX512-NEXT: vpsraq $56, %xmm0, %xmm0
 ; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: retq
 %x = icmp sgt <2 x i8> %a, %b
@@ -277,8 +274,7 @@
 ; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0
 ; AVX512-NEXT: vpsraq $48, %xmm0, %xmm0
 ; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: retq
 %x = icmp sgt <2 x i16> %a, %b
@@ -351,8 +347,7 @@
 ; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
 ; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
 ; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: retq
 %x = icmp sgt <2 x i32> %a, %b
@@ -388,8 +383,7 @@
 ; AVX512-LABEL: v2i64:
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: retq
 %x = icmp sgt <2 x i64> %a, %b
@@ -415,8 +409,7 @@
 ; AVX512-LABEL: v2f64:
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: retq
 %x = fcmp ogt <2 x double> %a, %b
@@ -454,8 +447,7 @@
 ; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
 ; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
 ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: retq
 %x = icmp sgt <4 x i8> %a, %b
@@ -493,8 +485,7 @@
 ; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
 ; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0
 ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: retq
 %x = icmp sgt <4 x i16> %a, %b
Index: test/CodeGen/X86/bitcast-setcc-256.ll
===================================================================
--- test/CodeGen/X86/bitcast-setcc-256.ll
+++ test/CodeGen/X86/bitcast-setcc-256.ll
@@ -251,8 +251,7 @@
 ; AVX512-LABEL: v4i64:
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
@@ -292,8 +291,7 @@
 ; AVX512-LABEL: v4f64:
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
Index: test/CodeGen/X86/setcc-lowering.ll
===================================================================
--- test/CodeGen/X86/setcc-lowering.ll
+++ test/CodeGen/X86/setcc-lowering.ll
@@ -87,8 +87,7 @@
 ; KNL-32-NEXT: cmovlw %dx, %si
 ; KNL-32-NEXT: kmovw %esi, %k1
 ; KNL-32-NEXT: kandw %k0, %k1, %k1
-; KNL-32-NEXT: kmovw %k1, %esi
-; KNL-32-NEXT: testw %si, %si
+; KNL-32-NEXT: ktestw %k1, %k1
 ; KNL-32-NEXT: jne .LBB1_1
 ; KNL-32-NEXT: # BB#2: # %for_exit600
 ; KNL-32-NEXT: popl %esi
Index: test/CodeGen/X86/vector-sext.ll
===================================================================
--- test/CodeGen/X86/vector-sext.ll
+++ test/CodeGen/X86/vector-sext.ll
@@ -1251,8 +1251,7 @@
 ;
 ; AVX512BW-LABEL: load_sext_2i1_to_2i64:
 ; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: movzbl (%rdi), %eax
-; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: kmovb (%rdi), %k1
 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
 ; AVX512BW-NEXT: vzeroupper
@@ -1443,8 +1442,7 @@
 ;
 ; AVX512BW-LABEL: load_sext_4i1_to_4i32:
 ; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: movzbl (%rdi), %eax
-; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: kmovb (%rdi), %k1
 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -1648,8 +1646,7 @@
 ;
 ; AVX512BW-LABEL: load_sext_4i1_to_4i64:
 ; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: movzbl (%rdi), %eax
-; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: kmovb (%rdi), %k1
 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
 ; AVX512BW-NEXT: retq
@@ -2006,8 +2003,7 @@
 ;
 ; AVX512BW-LABEL: load_sext_8i1_to_8i16:
 ; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: movzbl (%rdi), %eax
-; AVX512BW-NEXT: kmovd %eax, %k0
+; AVX512BW-NEXT: kmovb (%rdi), %k0
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
 ; AVX512BW-NEXT: vzeroupper
@@ -2408,8 +2404,7 @@
 ;
 ; AVX512BW-LABEL: load_sext_8i1_to_8i32:
 ; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: movzbl (%rdi), %eax
-; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: kmovb (%rdi), %k1
 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
 ; AVX512BW-NEXT: retq
Index: test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -40,8 +40,7 @@
 define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
 ; X32-LABEL: combine_permvar_8f64_identity_mask:
 ; X32: # BB#0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
 ; X32-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
 ; X32-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1}
 ; X32-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
@@ -78,8 +77,7 @@
 define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
 ; X32-LABEL: combine_permvar_8i64_identity_mask:
 ; X32: # BB#0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
 ; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
 ; X32-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1}
 ; X32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
@@ -116,8 +114,7 @@
 define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
 ; X32-LABEL: combine_vpermt2var_8f64_identity_mask:
 ; X32: # BB#0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
 ; X32-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
 ; X32-NEXT: vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z}
 ; X32-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
@@ -168,8 +165,7 @@
 define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
 ; X32-LABEL: combine_vpermt2var_8f64_movddup_mask:
 ; X32: # BB#0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
 ; X32-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
 ; X32-NEXT: retl
 ;
@@ -197,8 +193,7 @@
 define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
 ; X32-LABEL: combine_vpermt2var_8i64_identity_mask:
 ; X32: # BB#0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
 ; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
 ; X32-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z}
 ; X32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
@@ -667,8 +662,7 @@
 define <8 x i64> @combine_permvar_8i64_as_permq_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
 ; X32-LABEL: combine_permvar_8i64_as_permq_mask:
 ; X32: # BB#0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
 ; X32-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
 ; X32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; X32-NEXT: retl
@@ -699,8 +693,7 @@
 define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
 ; X32-LABEL: combine_permvar_8f64_as_permpd_mask:
 ; X32: # BB#0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
 ; X32-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
 ; X32-NEXT: vmovapd %zmm1, %zmm0
 ; X32-NEXT: retl