Index: lib/Target/X86/CMakeLists.txt =================================================================== --- lib/Target/X86/CMakeLists.txt +++ lib/Target/X86/CMakeLists.txt @@ -21,6 +21,7 @@ X86CallFrameOptimization.cpp X86CallLowering.cpp X86CmovConversion.cpp + X86DomainReassignment.cpp X86ExpandPseudo.cpp X86FastISel.cpp X86FixupBWInsts.cpp Index: lib/Target/X86/X86.h =================================================================== --- lib/Target/X86/X86.h +++ lib/Target/X86/X86.h @@ -92,6 +92,10 @@ /// the upper portions of registers, and to save code size. FunctionPass *createX86FixupBWInsts(); +/// Return a Machine IR pass that reassigns instruction chains from one domain +/// to another, when profitable. +FunctionPass *createX86DomainReassignmentPass(); + void initializeFixupBWInstPassPass(PassRegistry &); /// This pass replaces EVEX ecnoded of AVX-512 instructiosn by VEX Index: lib/Target/X86/X86DomainReassignment.cpp =================================================================== --- /dev/null +++ lib/Target/X86/X86DomainReassignment.cpp @@ -0,0 +1,730 @@ +//===--- X86DomainReassignment.cpp - Selectively switch register classes---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass attempts to find instruction chains (closures) in one domain, +// and convert them to equivalent instructions in a different domain, +// if profitable. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <memory>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-domain-reassignment"
+
+STATISTIC(NumClosuresConverted, "Number of closures converted by the pass");
+
+// Hidden command-line switch (off by default). When it is set,
+// runOnMachineFunction returns before doing any work.
+static cl::opt<bool> DisableX86DomainReassignment(
+    "disable-x86-domain-reassignment", cl::Hidden,
+    cl::desc("X86: Disable Virtual Register Reassignment."), cl::init(false));
+
+namespace {
+/// Register domains known to the pass. NoDomain is the initial state of a
+/// closure, before its first register has fixed the domain.
+enum RegDomain { NoDomain = -1, GPRDomain, MaskDomain, OtherDomain };
+
+/// \returns true if \p RC is exactly one of the GR8/GR16/GR32/GR64 classes.
+/// Sub-classes of those are deliberately not matched.
+static bool isGPR(const TargetRegisterClass *RC) {
+  return RC == &X86::GR8RegClass || RC == &X86::GR16RegClass ||
+         RC == &X86::GR32RegClass || RC == &X86::GR64RegClass;
+}
+
+/// \returns true if \p RC has a common sub-class with VK16, i.e. it can hold
+/// a mask register.
+static bool isMask(const TargetRegisterClass *RC,
+                   const TargetRegisterInfo *TRI) {
+  return TRI->getCommonSubClass(RC, &X86::VK16RegClass) != nullptr;
+}
+
+/// Classify \p RC. The GPR test is done first; anything that is neither a GPR
+/// nor a mask class is reported as OtherDomain.
+static RegDomain getDomain(const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) {
+  if (isGPR(RC))
+    return GPRDomain;
+  if (isMask(RC, TRI))
+    return MaskDomain;
+  return OtherDomain;
+}
+
+/// Return a register class equivalent to \p SrcRC, in \p Domain.
+static const TargetRegisterClass *getDstRC(const TargetRegisterClass *SrcRC,
+                                           RegDomain Domain) {
+  // GPR -> mask is the only mapping implemented. The assert and the
+  // unreachable below mark the two places to extend when that changes.
+  assert(Domain == MaskDomain && "add domain");
+  if (SrcRC == &X86::GR8RegClass)
+    return &X86::VK8RegClass;
+  if (SrcRC == &X86::GR16RegClass)
+    return &X86::VK16RegClass;
+  if (SrcRC == &X86::GR32RegClass)
+    return &X86::VK32RegClass;
+  if (SrcRC == &X86::GR64RegClass)
+    return &X86::VK64RegClass;
+  llvm_unreachable("add register class");
+}
+
+/// Abstract Instruction Converter class.
+///
+/// A converter knows how to rewrite one source instruction for a destination
+/// domain, whether that rewrite is legal, and what it costs.
+class InstrConverter {
+public:
+  virtual ~InstrConverter() = default;
+
+  /// \returns true if \p MI is legal to convert.
+  virtual bool isLegal(const MachineInstr *MI,
+                       const TargetInstrInfo *TII) const = 0;
+
+  /// Applies conversion to \p MI.
+  ///
+  /// \returns true if \p MI is no longer needed, and can be deleted.
+  virtual bool ConvertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+                            MachineRegisterInfo *MRI) const = 0;
+
+  /// \returns the cost increment incurred by converting \p MI.
+  virtual double getExtraCost(const MachineInstr *MI,
+                              MachineRegisterInfo *MRI) const = 0;
+};
+
+/// An Instruction Converter which ignores the given instruction.
+///
+/// The instruction is always legal, is left in place (ConvertInstr returns
+/// false so it is not erased) and costs nothing.
+class InstrIgnore : public InstrConverter {
+public:
+  bool isLegal(const MachineInstr *MI,
+               const TargetInstrInfo *TII) const override {
+    return true;
+  }
+
+  bool ConvertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+                    MachineRegisterInfo *MRI) const override {
+    return false;
+  }
+
+  double getExtraCost(const MachineInstr *MI,
+                      MachineRegisterInfo *MRI) const override {
+    return 0;
+  }
+};
+
+/// An Instruction Converter which replaces an instruction with another.
+class InstrReplacer : public InstrConverter {
+public:
+  /// Opcode of the destination instruction.
+  unsigned DstOpcode;
+
+  explicit InstrReplacer(unsigned DstOpcode) : DstOpcode(DstOpcode) {}
+
+  bool isLegal(const MachineInstr *MI,
+               const TargetInstrInfo *TII) const override {
+    // It's illegal to replace an instruction that implicitly defines a register
+    // with an instruction that doesn't, unless that register is dead.
+    for (auto &MO : MI->implicit_operands())
+      if (MO.isReg() && MO.isDef() && !MO.isDead() &&
+          !TII->get(DstOpcode).hasImplicitDefOfPhysReg(MO.getReg()))
+        return false;
+    return true;
+  }
+
+  /// Builds a DstOpcode instruction in front of \p MI carrying all of MI's
+  /// explicit operands. \returns true, so the caller erases \p MI.
+  bool ConvertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+                    MachineRegisterInfo *MRI) const override {
+    assert(isLegal(MI, TII) && "Cannot convert instruction");
+    MachineInstrBuilder Bld =
+        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(DstOpcode));
+    for (auto &Op : MI->explicit_operands())
+      Bld.add(Op);
+    return true;
+  }
+
+  double getExtraCost(const MachineInstr *MI,
+                      MachineRegisterInfo *MRI) const override {
+    // Assuming instructions have the same cost.
+    return 0;
+  }
+};
+
+/// An Instruction Converter which replaces an instruction with another, and
+/// adds a COPY from the new instruction's destination to the old one's.
+class InstrReplacerDstCOPY : public InstrConverter {
+public:
+  /// Opcode of the instruction emitted in place of the original one.
+  unsigned DstOpcode;
+
+  InstrReplacerDstCOPY(unsigned DstOpcode) : DstOpcode(DstOpcode) {}
+
+  /// This converter accepts every instruction it is asked about.
+  bool isLegal(const MachineInstr *MI,
+               const TargetInstrInfo *TII) const override {
+    return true;
+  }
+
+  /// Emits, in front of \p MI, "NewReg = DstOpcode <operands 1..N of MI>"
+  /// followed by "<operand 0 of MI> = COPY NewReg".
+  ///
+  /// \returns true, so the caller erases \p MI.
+  bool ConvertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+                    MachineRegisterInfo *MRI) const override {
+    MachineBasicBlock &Block = *MI->getParent();
+    const DebugLoc &Loc = MI->getDebugLoc();
+
+    // The replacement writes a fresh virtual register whose class is the one
+    // required by operand 0 of the destination opcode.
+    const TargetRegisterClass *NewRC =
+        TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(),
+                         *Block.getParent());
+    unsigned NewReg = MRI->createVirtualRegister(NewRC);
+
+    // Forward every operand of MI except its definition (operand 0).
+    MachineInstrBuilder NewMI =
+        BuildMI(Block, MI, Loc, TII->get(DstOpcode), NewReg);
+    const unsigned NumOps = MI->getNumOperands();
+    for (unsigned OpNo = 1; OpNo < NumOps; ++OpNo)
+      NewMI.add(MI->getOperand(OpNo));
+
+    // Route the new value into the register MI used to define.
+    MachineInstrBuilder Copy =
+        BuildMI(Block, MI, Loc, TII->get(TargetOpcode::COPY));
+    Copy.add(MI->getOperand(0));
+    Copy.addReg(NewReg);
+
+    return true;
+  }
+
+  double getExtraCost(const MachineInstr *MI,
+                      MachineRegisterInfo *MRI) const override {
+    // The new instruction is assumed to cost the same as the old one, and the
+    // added COPY is assumed to stay within one domain and be eliminated.
+    return 0;
+  }
+};
+
+/// An Instruction Converter for replacing COPY instructions.
+class InstrCOPYReplacer : public InstrReplacer {
+  /// \returns true if \p Reg is a physical register in GR8 or GR16.
+  static bool isNarrowPhysGPR(unsigned Reg) {
+    return TargetRegisterInfo::isPhysicalRegister(Reg) &&
+           (X86::GR8RegClass.contains(Reg) || X86::GR16RegClass.contains(Reg));
+  }
+
+public:
+  /// Domain the closure is being converted to.
+  RegDomain DstDomain;
+
+  InstrCOPYReplacer(RegDomain DstDomain, unsigned DstOpcode)
+      : InstrReplacer(DstOpcode), DstDomain(DstDomain) {}
+
+  bool isLegal(const MachineInstr *MI,
+               const TargetInstrInfo *TII) const override {
+    if (MI->getOpcode() != TargetOpcode::COPY)
+      return false;
+
+    // Once the virtual side of the COPY becomes a mask register, a COPY to or
+    // from an 8/16-bit physical GPR would have to be lowered as a direct
+    // GR8/GR16 <-> mask move.
+    // NOTE(review): X86's physical register copy lowering is believed to
+    // provide only the GR32/GR64 <-> mask forms - confirm against
+    // X86InstrInfo::copyPhysReg. Until then such copies are rejected.
+    if (isNarrowPhysGPR(MI->getOperand(0).getReg()) ||
+        isNarrowPhysGPR(MI->getOperand(1).getReg()))
+      return false;
+
+    return true;
+  }
+
+  /// Cost model for a COPY, decided by the first operand (definition first)
+  /// that is conclusive:
+  ///   * a physical register costs +1;
+  ///   * a register already in DstDomain yields -1.
+  /// If no operand is conclusive the cost is 0.
+  double getExtraCost(const MachineInstr *MI,
+                      MachineRegisterInfo *MRI) const override {
+    assert(MI->getOpcode() == TargetOpcode::COPY && "Expected a COPY");
+
+    for (auto &MO : MI->operands()) {
+      if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+        return 1;
+
+      RegDomain OpDomain = getDomain(MRI->getRegClass(MO.getReg()),
+                                     MRI->getTargetRegisterInfo());
+      // Converting a cross domain COPY to a same domain COPY should eliminate
+      // an instruction.
+      if (OpDomain == DstDomain)
+        return -1;
+    }
+    return 0;
+  }
+};
+
+/// An Instruction Converter which replaces an instruction with a COPY.
+class InstrReplaceWithCopy : public InstrConverter {
+public:
+  /// Source instruction operand index, to be used as the COPY source.
+  unsigned SrcOpIdx;
+
+  explicit InstrReplaceWithCopy(unsigned SrcOpIdx) : SrcOpIdx(SrcOpIdx) {}
+
+  bool isLegal(const MachineInstr *MI,
+               const TargetInstrInfo *TII) const override {
+    return true;
+  }
+
+  /// Emits "<operand 0 of MI> = COPY <operand SrcOpIdx of MI>" in front of
+  /// \p MI. \returns true, so the caller erases \p MI.
+  bool ConvertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+                    MachineRegisterInfo *MRI) const override {
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+            TII->get(TargetOpcode::COPY))
+        .add({MI->getOperand(0), MI->getOperand(SrcOpIdx)});
+    return true;
+  }
+
+  double getExtraCost(const MachineInstr *MI,
+                      MachineRegisterInfo *MRI) const override {
+    return 0;
+  }
+};
+
+/// An Instruction Converter which completely deletes an instruction.
+class InstrDeleter : public InstrConverter {
+public:
+  bool isLegal(const MachineInstr *MI,
+               const TargetInstrInfo *TII) const override {
+    return true;
+  }
+
+  /// Emits nothing; returning true asks the caller to erase \p MI.
+  bool ConvertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+                    MachineRegisterInfo *MRI) const override {
+    return true;
+  }
+
+  double getExtraCost(const MachineInstr *MI,
+                      MachineRegisterInfo *MRI) const override {
+    return 0;
+  }
+};
+
+// Key type to be used by the Instruction Converters map.
+// A converter is identified by <destination domain, source opcode>.
+typedef std::pair<int, unsigned> InstrConverterKeyTy;
+
+typedef DenseMap<InstrConverterKeyTy, InstrConverter *> InstrConverterMap;
+
+/// A closure is a set of virtual register representing all of the edges in
+/// the closure, as well as all of the instructions connected by those edges.
+///
+/// A closure may encompass virtual registers in the same register bank that
+/// have different widths. For example, it may contain 32-bit GPRs as well as
+/// 64-bit GPRs.
+///
+/// A closure that computes an address (i.e. defines a virtual register that is
+/// used in a memory operand) excludes the instructions that contain memory
+/// operands using the address. Such an instruction will be included in a
+/// different closure that manipulates the loaded or stored value.
+///
+/// Ownership of an instruction is recorded in the shared EnclosedInstrs map as
+/// the address of the owning Closure. A Closure must therefore stay at one
+/// address, distinct from every other closure's, for as long as that map is
+/// in use.
+class Closure {
+private:
+  const TargetInstrInfo *TII;
+  MachineRegisterInfo *MRI;
+
+  /// Virtual registers in the closure.
+  DenseSet<unsigned> Edges;
+
+  /// Instructions in the closure.
+  SmallVector<MachineInstr *, 8> Instrs;
+
+  /// A map of available Instruction Converters.
+  const InstrConverterMap &Converters;
+
+  /// The register domain of this closure.
+  RegDomain Domain;
+
+  /// Domains which this closure can legally be reassigned to.
+  SmallVector<RegDomain, 2> LegalDstDomains;
+
+  /// Enqueue \p Reg to be considered for addition to the closure.
+  void visitRegister(unsigned Reg, SmallVectorImpl<unsigned> &Worklist);
+
+  /// Add \p MI to this closure.
+  void encloseInstr(MachineInstr *MI);
+
+  /// All edges that are included in some closure.
+  DenseSet<unsigned> &EnclosedEdges;
+
+  /// All instructions that are included in some closure.
+  DenseMap<MachineInstr *, Closure *> &EnclosedInstrs;
+
+public:
+  Closure(const TargetInstrInfo *TII, MachineRegisterInfo *MRI,
+          const InstrConverterMap &Converters,
+          const SmallVector<RegDomain, 2> &LegalDstDomains,
+          DenseSet<unsigned> &EnclosedEdges,
+          DenseMap<MachineInstr *, Closure *> &EnclosedInstrs)
+      : TII(TII), MRI(MRI), Converters(Converters), Domain(NoDomain),
+        LegalDstDomains(LegalDstDomains), EnclosedEdges(EnclosedEdges),
+        EnclosedInstrs(EnclosedInstrs) {}
+
+  /// Starting from \p Reg, expand the closure as much as possible.
+  void buildClosure(unsigned Reg);
+
+  /// Calculate the total cost of reassigning the closure to \p Domain.
+  double calculateCost(RegDomain Domain) const;
+
+  /// Reassign the closure to \p Domain.
+  void Reassign(RegDomain Domain) const;
+
+  /// Mark this closure as illegal for reassignment to all domains.
+  void setAllIllegal() { LegalDstDomains.clear(); }
+
+  /// Mark this closure as illegal for reassignment for domain \p RD
+  void setIllegal(RegDomain RD) {
+    auto I = find(LegalDstDomains, RD);
+    if (I != LegalDstDomains.end())
+      LegalDstDomains.erase(I);
+  }
+
+  /// \returns true if this closure has domains which are legal to reassign to.
+  bool hasLegalDstDomain() const { return !LegalDstDomains.empty(); }
+
+  /// \returns true if is legal to reassign this closure to domain \p RD.
+  bool isLegal(RegDomain RD) const { return is_contained(LegalDstDomains, RD); }
+
+  /// \returns true if no register has been added to the closure.
+  bool empty() const { return Edges.empty(); }
+};
+
+class X86DomainReassignment : public MachineFunctionPass {
+public:
+  static char ID;
+
+  X86DomainReassignment() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override {
+    return "X86 Domain Reassignment Pass";
+  }
+
+private:
+  // Per-function state, refreshed at the start of runOnMachineFunction.
+  const X86Subtarget *STI = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+  const X86InstrInfo *TII = nullptr;
+
+  /// A map of available Instruction Converters.
+  InstrConverterMap Converters;
+
+  /// Initialize Converters map.
+  void initConverters();
+};
+
+char X86DomainReassignment::ID = 0;
+} // End anonymous namespace.
+
+void Closure::visitRegister(unsigned Reg, SmallVectorImpl<unsigned> &Worklist) {
+  // Already claimed by some closure.
+  if (EnclosedEdges.count(Reg))
+    return;
+
+  if (!TargetRegisterInfo::isVirtualRegister(Reg))
+    return;
+
+  // Only registers with exactly one definition are followed.
+  if (!MRI->hasOneDef(Reg))
+    return;
+
+  RegDomain RD = getDomain(MRI->getRegClass(Reg), MRI->getTargetRegisterInfo());
+  // First edge in closure sets the domain.
+  if (Domain == NoDomain)
+    Domain = RD;
+
+  // Registers of another domain are boundaries of the closure.
+  if (Domain != RD)
+    return;
+
+  Worklist.push_back(Reg);
+}
+
+void Closure::encloseInstr(MachineInstr *MI) {
+  auto I = EnclosedInstrs.find(MI);
+  if (I != EnclosedInstrs.end()) {
+    if (I->second != this)
+      // Instruction already belongs to another closure, avoid conflicts between
+      // closure and mark this closure as illegal.
+      setAllIllegal();
+    return;
+  }
+
+  EnclosedInstrs[MI] = this;
+  Instrs.push_back(MI);
+
+  // Drop every destination domain that has no converter for MI, or whose
+  // converter rejects MI. The list is walked by index because entries are
+  // erased during the walk; erasing inside a range-based for would invalidate
+  // its iterators.
+  for (unsigned Idx = 0; Idx < LegalDstDomains.size();) {
+    RegDomain D = LegalDstDomains[Idx];
+    InstrConverter *IC = Converters.lookup({D, MI->getOpcode()});
+    if (IC && IC->isLegal(MI, TII)) {
+      ++Idx;
+      continue;
+    }
+    // Erasing shifts the next candidate into slot Idx, so do not advance.
+    LegalDstDomains.erase(LegalDstDomains.begin() + Idx);
+  }
+}
+
+double Closure::calculateCost(RegDomain DstDomain) const {
+  assert(isLegal(DstDomain) && "Cannot calculate cost for illegal closure");
+
+  // Legality guarantees that a converter exists for every enclosed
+  // instruction, so the lookup result is dereferenced unchecked.
+  double Cost = 0.0;
+  for (auto MI : Instrs)
+    Cost +=
+        Converters.lookup({DstDomain, MI->getOpcode()})->getExtraCost(MI, MRI);
+  return Cost;
+}
+
+void Closure::Reassign(RegDomain DstDomain) const {
+  assert(isLegal(DstDomain) && "Cannot convert illegal closure");
+
+  // Iterate all instructions in the closure, convert each one using the
+  // appropriate converter. Instructions the converter reports as obsolete are
+  // only collected here and erased at the very end.
+  SmallVector<MachineInstr *, 8> ToErase;
+  for (auto MI : Instrs)
+    if (Converters.lookup({DstDomain, MI->getOpcode()})
+            ->ConvertInstr(MI, TII, MRI))
+      ToErase.push_back(MI);
+
+  // Iterate all registers in the closure, replace them with registers in the
+  // destination domain.
+  for (unsigned Reg : Edges) {
+    unsigned DestReg =
+        MRI->createVirtualRegister(getDstRC(MRI->getRegClass(Reg), DstDomain));
+    MRI->replaceRegWith(Reg, DestReg);
+    for (auto &MO : MRI->use_operands(DestReg)) {
+      if (MO.isReg())
+        // Remove all subregister references as they are not valid in the
+        // destination domain.
+        MO.setSubReg(0);
+    }
+  }
+
+  for (auto MI : ToErase)
+    MI->eraseFromParent();
+}
+
+/// \returns true when \p Reg is used as part of an address calculation in \p
+/// MI.
+static bool usedAsAddr(const MachineInstr &MI, unsigned Reg,
+                       const TargetInstrInfo *TII) {
+  if (!MI.mayLoadOrStore())
+    return false;
+
+  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
+  int MemOpStart = X86II::getMemoryOperandNo(Desc.TSFlags);
+  if (MemOpStart == -1)
+    return false;
+
+  // getMemoryOperandNo does not include the operand bias; add it to obtain
+  // the index of the first address operand in MI.
+  MemOpStart += X86II::getOperandBias(Desc);
+  for (unsigned MemOpIdx = MemOpStart,
+                MemOpEnd = MemOpStart + X86::AddrNumOperands;
+       MemOpIdx < MemOpEnd; ++MemOpIdx) {
+    auto &Op = MI.getOperand(MemOpIdx);
+    if (Op.isReg() && Op.getReg() == Reg)
+      return true;
+  }
+  return false;
+}
+
+void Closure::buildClosure(unsigned Reg) {
+  SmallVector<unsigned, 4> Worklist;
+  visitRegister(Reg, Worklist);
+  while (!Worklist.empty()) {
+    unsigned CurReg = Worklist.pop_back_val();
+
+    // Register already in this closure.
+    if (!Edges.insert(CurReg).second)
+      continue;
+
+    // Publish the edge in the function-wide set, so that no later closure is
+    // started from, or expanded into, a register this closure already owns.
+    EnclosedEdges.insert(CurReg);
+
+    MachineInstr *DefMI = MRI->getVRegDef(CurReg);
+    encloseInstr(DefMI);
+
+    // Add register used by the defining MI to the worklist.
+    // Do not add registers which are used in address calculation, they will be
+    // added to a different closure.
+    int OpEnd = DefMI->getNumOperands();
+    const MCInstrDesc &Desc = DefMI->getDesc();
+    int MemOp = X86II::getMemoryOperandNo(Desc.TSFlags);
+    if (MemOp != -1)
+      MemOp += X86II::getOperandBias(Desc);
+    for (int OpIdx = 0; OpIdx < OpEnd; ++OpIdx) {
+      if (OpIdx == MemOp) {
+        // skip address calculation.
+        OpIdx += (X86::AddrNumOperands - 1);
+        continue;
+      }
+      auto &Op = DefMI->getOperand(OpIdx);
+      if (!Op.isReg() || !Op.isUse())
+        continue;
+      visitRegister(Op.getReg(), Worklist);
+    }
+
+    // Expand closure through register uses.
+    for (auto &UseMI : MRI->use_instructions(CurReg)) {
+      // We would like to avoid converting closures which calculate addresses,
+      // as this should remain in GPRs.
+      if (usedAsAddr(UseMI, CurReg, TII)) {
+        setAllIllegal();
+        continue;
+      }
+      encloseInstr(&UseMI);
+
+      for (auto &DefOp : UseMI.defs()) {
+        if (!DefOp.isReg())
+          continue;
+
+        // A user that defines a physical register cannot be converted.
+        unsigned DefReg = DefOp.getReg();
+        if (!TargetRegisterInfo::isVirtualRegister(DefReg)) {
+          setAllIllegal();
+          continue;
+        }
+        visitRegister(DefReg, Worklist);
+      }
+    }
+  }
+}
+
+void X86DomainReassignment::initConverters() {
+  Converters[{MaskDomain, TargetOpcode::PHI}] = new InstrIgnore();
+
+  Converters[{MaskDomain, TargetOpcode::IMPLICIT_DEF}] = new InstrDeleter();
+
+  Converters[{MaskDomain, TargetOpcode::INSERT_SUBREG}] =
+      new InstrReplaceWithCopy(2);
+
+  Converters[{MaskDomain, TargetOpcode::COPY}] =
+      new InstrCOPYReplacer(MaskDomain, TargetOpcode::COPY);
+
+  auto createReplacerDstCOPY = [&](unsigned From, unsigned To) {
+    Converters[{MaskDomain, From}] = new InstrReplacerDstCOPY(To);
+  };
+
+  createReplacerDstCOPY(X86::MOVZX32rm16, X86::KMOVWkm);
+  createReplacerDstCOPY(X86::MOVZX64rm16, X86::KMOVWkm);
+
+  createReplacerDstCOPY(X86::MOVZX32rr16, X86::KMOVWkk);
+  createReplacerDstCOPY(X86::MOVZX64rr16, X86::KMOVWkk);
+
+  if (STI->hasDQI()) {
+    createReplacerDstCOPY(X86::MOVZX16rm8, X86::KMOVBkm);
+    createReplacerDstCOPY(X86::MOVZX32rm8, X86::KMOVBkm);
+    createReplacerDstCOPY(X86::MOVZX64rm8, X86::KMOVBkm);
+
+    createReplacerDstCOPY(X86::MOVZX16rr8, X86::KMOVBkk);
+    createReplacerDstCOPY(X86::MOVZX32rr8, X86::KMOVBkk);
+    createReplacerDstCOPY(X86::MOVZX64rr8, X86::KMOVBkk);
+  }
+
+  auto createReplacer = [&](unsigned From, unsigned To) {
+    Converters[{MaskDomain, From}] = new InstrReplacer(To);
+  };
+
+  createReplacer(X86::MOV16rm, X86::KMOVWkm);
+  createReplacer(X86::MOV16mr, X86::KMOVWmk);
+  createReplacer(X86::MOV16rr, X86::KMOVWkk);
+  createReplacer(X86::SHR16ri, X86::KSHIFTRWri);
+  createReplacer(X86::SHL16ri, X86::KSHIFTLWri);
+  createReplacer(X86::NOT16r, X86::KNOTWrr);
+  createReplacer(X86::OR16rr, X86::KORWrr);
+  createReplacer(X86::AND16rr, X86::KANDWrr);
+  createReplacer(X86::XOR16rr, X86::KXORWrr);
+
+  if (STI->hasBWI()) {
+    createReplacer(X86::MOV32rm, X86::KMOVDkm);
+    createReplacer(X86::MOV64rm, X86::KMOVQkm);
+
+    createReplacer(X86::MOV32mr, X86::KMOVDmk);
+    createReplacer(X86::MOV64mr, X86::KMOVQmk);
+
+    createReplacer(X86::MOV32rr, X86::KMOVDkk);
+    createReplacer(X86::MOV64rr, X86::KMOVQkk);
+
+    createReplacer(X86::SHR32ri, X86::KSHIFTRDri);
+    createReplacer(X86::SHR64ri, X86::KSHIFTRQri);
+
+    createReplacer(X86::SHL32ri, X86::KSHIFTLDri);
+    createReplacer(X86::SHL64ri, X86::KSHIFTLQri);
+
+    createReplacer(X86::ADD32rr, X86::KADDDrr);
+    createReplacer(X86::ADD64rr, X86::KADDQrr);
+
+    createReplacer(X86::NOT32r, X86::KNOTDrr);
+    createReplacer(X86::NOT64r, X86::KNOTQrr);
+
+    createReplacer(X86::OR32rr, X86::KORDrr);
+    createReplacer(X86::OR64rr, X86::KORQrr);
+
+    createReplacer(X86::AND32rr, X86::KANDDrr);
+    createReplacer(X86::AND64rr, X86::KANDQrr);
+
+    createReplacer(X86::ANDN32rr, X86::KANDNDrr);
+    createReplacer(X86::ANDN64rr, X86::KANDNQrr);
+
+    createReplacer(X86::XOR32rr, X86::KXORDrr);
+    createReplacer(X86::XOR64rr, X86::KXORQrr);
+
+    // TEST32rr/TEST64rr are intentionally not mapped to KTESTD/KTESTQ. Both
+    // instructions write EFLAGS, so the implicit-def legality check would
+    // accept the replacement, but the individual flags are computed
+    // differently and a flag consumer could observe a different value.
+  }
+
+  if (STI->hasDQI()) {
+    createReplacer(X86::ADD8rr, X86::KADDBrr);
+    createReplacer(X86::ADD16rr, X86::KADDWrr);
+
+    createReplacer(X86::AND8rr, X86::KANDBrr);
+
+    createReplacer(X86::MOV8rm, X86::KMOVBkm);
+    createReplacer(X86::MOV8mr, X86::KMOVBmk);
+    createReplacer(X86::MOV8rr, X86::KMOVBkk);
+
+    createReplacer(X86::NOT8r, X86::KNOTBrr);
+
+    createReplacer(X86::OR8rr, X86::KORBrr);
+
+    createReplacer(X86::SHR8ri, X86::KSHIFTRBri);
+    createReplacer(X86::SHL8ri, X86::KSHIFTLBri);
+
+    // TEST8rr/TEST16rr are not mapped to KTESTB/KTESTW, for the same flag
+    // reason as the 32/64-bit forms above.
+
+    createReplacer(X86::XOR8rr, X86::KXORBrr);
+  }
+}
+
+bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+  if (DisableX86DomainReassignment)
+    return false;
+
+  DEBUG(dbgs() << "***** Machine Function before Domain Reassignment *****\n");
+  DEBUG(MF.print(dbgs()));
+
+  STI = &MF.getSubtarget<X86Subtarget>();
+  // GPR->K is the only transformation currently supported, bail out early if no
+  // AVX512.
+  if (!STI->hasAVX512())
+    return false;
+
+  MRI = &MF.getRegInfo();
+  TII = STI->getInstrInfo();
+  initConverters();
+  bool Changed = false;
+
+  DenseSet<unsigned> EnclosedEdges;
+  DenseMap<MachineInstr *, Closure *> EnclosedInstrs;
+
+  // EnclosedInstrs identifies the owner of an instruction by the address of
+  // its Closure. Every closure that enclosed anything is therefore kept
+  // alive, at a fixed heap address, until the function has been processed;
+  // otherwise a later closure could reuse the address and be mistaken for
+  // the owner.
+  std::vector<std::unique_ptr<Closure>> Closures;
+
+  // Go over all virtual registers and calculate a closure.
+  for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(Idx);
+
+    // GPR only current source domain supported.
+    if (!isGPR(MRI->getRegClass(Reg)))
+      continue;
+
+    // Register already in closure.
+    if (EnclosedEdges.count(Reg))
+      continue;
+
+    // Calculate closure starting with Reg.
+    std::unique_ptr<Closure> C(new Closure(
+        TII, MRI, Converters, {MaskDomain}, EnclosedEdges, EnclosedInstrs));
+    C->buildClosure(Reg);
+
+    // A closure without edges has enclosed no instruction, so nothing refers
+    // to it and it can be dropped right away.
+    if (!C->empty())
+      Closures.push_back(std::move(C));
+  }
+
+  // Convert every closure that is still legal and profitable.
+  for (const std::unique_ptr<Closure> &C : Closures) {
+    if (!C->isLegal(MaskDomain))
+      continue;
+    if (C->calculateCost(MaskDomain) < 0.0) {
+      C->Reassign(MaskDomain);
+      ++NumClosuresConverted;
+      Changed = true;
+    }
+  }
+
+  // The converter table depends on the subtarget features of this function.
+  // Free the converters and also empty the map, so that no dangling entry
+  // survives into a later function that registers fewer converters.
+  for (auto &I : Converters)
+    delete I.second;
+  Converters.clear();
+
+  DEBUG(dbgs() << "***** Machine Function after Domain Reassignment *****\n");
+  DEBUG(MF.print(dbgs()));
+
+  return Changed;
+}
+
+/// Returns an instance of the Domain Reassignment pass.
+FunctionPass *llvm::createX86DomainReassignmentPass() {
+  // Every call heap-allocates a new pass object and hands it back as a raw
+  // pointer.
+  return new X86DomainReassignment();
+}
Index: lib/Target/X86/X86TargetMachine.cpp =================================================================== --- lib/Target/X86/X86TargetMachine.cpp +++ lib/Target/X86/X86TargetMachine.cpp @@ -312,6 +312,7 @@ bool addGlobalInstructionSelect() override; bool addILPOpts() override; bool addPreISel() override; + void addMachineSSAOptimization() override; void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreEmitPass() override; @@ -405,6 +406,10 @@ addPass(createX86WinAllocaExpander()); } +void X86PassConfig::addMachineSSAOptimization() { + addPass(createX86DomainReassignmentPass()); + TargetPassConfig::addMachineSSAOptimization(); +} void X86PassConfig::addPostRegAlloc() { addPass(createX86FloatingPointStackifierPass()); Index: test/CodeGen/X86/avx512-insert-extract.ll =================================================================== --- test/CodeGen/X86/avx512-insert-extract.ll +++ test/CodeGen/X86/avx512-insert-extract.ll @@ -391,11 +391,10 @@ ; ; SKX-LABEL: test16: ; SKX: ## BB#0: -; SKX-NEXT: movb (%rdi), %al -; SKX-NEXT: kmovd %esi, %k0 -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: vpmovm2d %k1, %zmm0 -; SKX-NEXT: vpmovm2d %k0, %zmm1 +; SKX-NEXT: kmovb (%rdi), %k0 +; SKX-NEXT: kmovd %esi, %k1 +; SKX-NEXT: vpmovm2d %k0, %zmm0 +; SKX-NEXT: vpmovm2d %k1, %zmm1 ; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15] ; SKX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; SKX-NEXT: vpmovd2m %zmm2, %k0 @@ -428,11 +427,10 @@ ; ; SKX-LABEL: test17: ; SKX: ## BB#0: -; SKX-NEXT: movb (%rdi), %al -; SKX-NEXT: kmovd %esi, %k0 -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: vpmovm2q %k1, %zmm0 -; SKX-NEXT: vpmovm2q %k0, %zmm1 +; SKX-NEXT: kmovb (%rdi), %k0 +; SKX-NEXT: kmovd %esi, %k1 +; SKX-NEXT: vpmovm2q %k0, %zmm0 +; SKX-NEXT: vpmovm2q %k1, %zmm1 ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7] ; SKX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; 
SKX-NEXT: vpmovq2m %zmm2, %k0 Index: test/CodeGen/X86/avx512-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -23,8 +23,7 @@ define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) { ; X32-LABEL: test_mm512_mask_broadcastd_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1} ; X32-NEXT: retl ; @@ -45,8 +44,7 @@ define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm512_maskz_broadcastd_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ; X32-NEXT: retl ; @@ -184,8 +182,7 @@ define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) { ; X32-LABEL: test_mm512_mask_broadcastss_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vbroadcastss %xmm1, %zmm0 {%k1} ; X32-NEXT: retl ; @@ -203,8 +200,7 @@ define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) { ; X32-LABEL: test_mm512_maskz_broadcastss_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ; X32-NEXT: retl ; @@ -288,8 +284,7 @@ define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) { ; X32-LABEL: test_mm512_mask_movehdup_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] 
; X32-NEXT: retl ; @@ -307,8 +302,7 @@ define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) { ; X32-LABEL: test_mm512_maskz_movehdup_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; X32-NEXT: retl ; @@ -340,8 +334,7 @@ define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) { ; X32-LABEL: test_mm512_mask_moveldup_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; X32-NEXT: retl ; @@ -359,8 +352,7 @@ define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) { ; X32-LABEL: test_mm512_maskz_moveldup_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; X32-NEXT: retl ; @@ -444,8 +436,7 @@ define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) { ; X32-LABEL: test_mm512_mask_permute_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12] ; X32-NEXT: retl ; @@ -463,8 +454,7 @@ define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) { ; X32-LABEL: test_mm512_maskz_permute_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12] ; X32-NEXT: retl ; @@ -602,8 +592,7 @@ define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x 
i64> %a0, i16 %a1, <8 x i64> %a2) { ; X32-LABEL: test_mm512_mask_shuffle_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] ; X32-NEXT: retl ; @@ -624,8 +613,7 @@ define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) { ; X32-LABEL: test_mm512_maskz_shuffle_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] ; X32-NEXT: retl ; @@ -714,8 +702,7 @@ define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) { ; X32-LABEL: test_mm512_mask_unpackhi_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15] ; X32-NEXT: retl ; @@ -737,8 +724,7 @@ define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) { ; X32-LABEL: test_mm512_maskz_unpackhi_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; X32-NEXT: retl ; @@ -877,8 +863,7 @@ define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) { ; X32-LABEL: test_mm512_mask_unpackhi_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vunpckhps {{.*#+}} zmm0 
{%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15] ; X32-NEXT: retl ; @@ -896,8 +881,7 @@ define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) { ; X32-LABEL: test_mm512_maskz_unpackhi_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; X32-NEXT: retl ; @@ -932,8 +916,7 @@ define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) { ; X32-LABEL: test_mm512_mask_unpacklo_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13] ; X32-NEXT: retl ; @@ -955,8 +938,7 @@ define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) { ; X32-LABEL: test_mm512_maskz_unpacklo_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; X32-NEXT: retl ; @@ -1095,8 +1077,7 @@ define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) { ; X32-LABEL: test_mm512_mask_unpacklo_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = 
zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13] ; X32-NEXT: retl ; @@ -1114,8 +1095,7 @@ define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) { ; X32-LABEL: test_mm512_maskz_unpacklo_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; X32-NEXT: retl ; Index: test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512-mask-op.ll +++ test/CodeGen/X86/avx512-mask-op.ll @@ -1168,44 +1168,42 @@ ; KNL-LABEL: test18: ; KNL: ## BB#0: ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: kmovw %esi, %k0 -; KNL-NEXT: kshiftlw $7, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kmovw %esi, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: kmovw %ecx, %k1 -; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] ; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 -; KNL-NEXT: kshiftlw $1, %k0, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k0 -; KNL-NEXT: kshiftlw $7, %k2, %k1 -; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: korw %k0, %k1, %k1 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} 
{z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test18: ; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k0 -; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kshiftlw $7, %k1, %k2 +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: kmovd %esi, %k2 +; SKX-NEXT: kshiftlw $7, %k2, %k0 +; SKX-NEXT: kshiftrw $15, %k0, %k0 +; SKX-NEXT: kshiftlw $6, %k2, %k2 ; SKX-NEXT: kshiftrw $15, %k2, %k2 -; SKX-NEXT: kshiftlw $6, %k1, %k1 -; SKX-NEXT: kshiftrw $15, %k1, %k1 -; SKX-NEXT: vpmovm2q %k0, %zmm0 -; SKX-NEXT: vpmovm2q %k1, %zmm1 +; SKX-NEXT: vpmovm2q %k1, %zmm0 +; SKX-NEXT: vpmovm2q %k2, %zmm1 ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] ; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; SKX-NEXT: vpmovq2m %zmm2, %k0 -; SKX-NEXT: kshiftlb $1, %k0, %k0 -; SKX-NEXT: kshiftrb $1, %k0, %k0 -; SKX-NEXT: kshiftlb $7, %k2, %k1 -; SKX-NEXT: korb %k1, %k0, %k0 +; SKX-NEXT: vpmovq2m %zmm2, %k1 +; SKX-NEXT: kshiftlb $1, %k1, %k1 +; SKX-NEXT: kshiftrb $1, %k1, %k1 +; SKX-NEXT: kshiftlb $7, %k0, %k0 +; SKX-NEXT: korb %k0, %k1, %k0 ; SKX-NEXT: vpmovm2w %k0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -1213,23 +1211,21 @@ ; AVX512BW-LABEL: test18: ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: kmovd %esi, %k0 -; AVX512BW-NEXT: kshiftlw $7, %k0, %k2 -; AVX512BW-NEXT: kshiftrw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $6, %k0, %k0 +; AVX512BW-NEXT: kmovd %esi, %k2 +; AVX512BW-NEXT: kshiftlw $7, %k2, %k0 ; AVX512BW-NEXT: kshiftrw $15, %k0, %k0 -; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $15, %k2, %k2 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpsllq $63, %zmm2, %zmm0 -; AVX512BW-NEXT: vptestmq %zmm0, 
%zmm0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $7, %k2, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $7, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: vzeroupper @@ -1237,21 +1233,21 @@ ; ; AVX512DQ-LABEL: test18: ; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: kmovw %edi, %k0 -; AVX512DQ-NEXT: kmovw %esi, %k1 -; AVX512DQ-NEXT: kshiftlw $7, %k1, %k2 +; AVX512DQ-NEXT: kmovw %edi, %k1 +; AVX512DQ-NEXT: kmovw %esi, %k2 +; AVX512DQ-NEXT: kshiftlw $7, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k2, %k2 ; AVX512DQ-NEXT: kshiftrw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $6, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 -; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 -; AVX512DQ-NEXT: vpmovm2q %k1, %zmm1 +; AVX512DQ-NEXT: vpmovm2q %k1, %zmm0 +; AVX512DQ-NEXT: vpmovm2q %k2, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovq2m %zmm2, %k0 -; AVX512DQ-NEXT: kshiftlb $1, %k0, %k0 -; AVX512DQ-NEXT: kshiftrb $1, %k0, %k0 -; AVX512DQ-NEXT: kshiftlb $7, %k2, %k1 -; AVX512DQ-NEXT: korb %k1, %k0, %k0 +; AVX512DQ-NEXT: vpmovq2m %zmm2, %k1 +; AVX512DQ-NEXT: kshiftlb $1, %k1, %k1 +; AVX512DQ-NEXT: kshiftrb $1, %k1, %k1 +; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0 +; AVX512DQ-NEXT: korb %k0, %k1, %k0 ; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper Index: test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll @@ -84,8 +84,7 @@ define <8 x i64> 
@test_mm512_mask_broadcastw_epi16(<8 x i64> %a0, i32 %a1, <2 x i64> %a2) { ; X32-LABEL: test_mm512_mask_broadcastw_epi16: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastw %xmm1, %zmm0 {%k1} ; X32-NEXT: retl ; @@ -106,8 +105,7 @@ define <8 x i64> @test_mm512_maskz_broadcastw_epi16(i32 %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm512_maskz_broadcastw_epi16: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ; X32-NEXT: retl ; @@ -241,8 +239,7 @@ define <8 x i64> @test_mm512_mask_unpackhi_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) { ; X32-LABEL: test_mm512_mask_unpackhi_epi16: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31] ; X32-NEXT: retl ; @@ -264,8 +261,7 @@ define <8 x i64> @test_mm512_maskz_unpackhi_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) { ; X32-LABEL: test_mm512_maskz_unpackhi_epi16: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] ; X32-NEXT: retl ; @@ -367,8 +363,7 @@ define <8 x i64> @test_mm512_mask_unpacklo_epi16(<8 x i64> 
%a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) { ; X32-LABEL: test_mm512_mask_unpacklo_epi16: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27] ; X32-NEXT: retl ; @@ -390,8 +385,7 @@ define <8 x i64> @test_mm512_maskz_unpacklo_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) { ; X32-LABEL: test_mm512_maskz_unpacklo_epi16: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] ; X32-NEXT: retl ; Index: test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll @@ -23,8 +23,7 @@ define <2 x i64> @test_mm_mask_broadcastb_epi8(<2 x i64> %a0, i16 %a1, <2 x i64> %a2) { ; X32-LABEL: test_mm_mask_broadcastb_epi8: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastb %xmm1, %xmm0 {%k1} ; X32-NEXT: retl ; @@ -45,8 +44,7 @@ define <2 x i64> @test_mm_maskz_broadcastb_epi8(i16 %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm_maskz_broadcastb_epi8: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovw 
{{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ; X32-NEXT: retl ; @@ -82,8 +80,7 @@ define <4 x i64> @test_mm256_mask_broadcastb_epi8(<4 x i64> %a0, i32 %a1, <2 x i64> %a2) { ; X32-LABEL: test_mm256_mask_broadcastb_epi8: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastb %xmm1, %ymm0 {%k1} ; X32-NEXT: retl ; @@ -104,8 +101,7 @@ define <4 x i64> @test_mm256_maskz_broadcastb_epi8(i32 %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm256_maskz_broadcastb_epi8: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ; X32-NEXT: retl ; @@ -200,8 +196,7 @@ define <4 x i64> @test_mm256_mask_broadcastw_epi16(<4 x i64> %a0, i16 %a1, <2 x i64> %a2) { ; X32-LABEL: test_mm256_mask_broadcastw_epi16: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastw %xmm1, %ymm0 {%k1} ; X32-NEXT: retl ; @@ -222,8 +217,7 @@ define <4 x i64> @test_mm256_maskz_broadcastw_epi16(i16 %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm256_maskz_broadcastw_epi16: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax -; X32-NEXT: kmovd %eax, %k1 +; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X32-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ; X32-NEXT: retl ; Index: test/CodeGen/X86/gpr-to-mask.ll =================================================================== --- test/CodeGen/X86/gpr-to-mask.ll +++ test/CodeGen/X86/gpr-to-mask.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq < %s | FileCheck %s define void @test_fcmp_storefloat(i1 %cond, float* %fptr, float %f1, float %f2, 
float %f3, float %f4, float %f5, float %f6) { ; CHECK-LABEL: test_fcmp_storefloat: @@ -7,13 +7,11 @@ ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # BB#1: # %if -; CHECK-NEXT: vcmpeqss %xmm3, %xmm2, %k0 +; CHECK-NEXT: vcmpeqss %xmm3, %xmm2, %k1 ; CHECK-NEXT: jmp .LBB0_3 ; CHECK-NEXT: .LBB0_2: # %else -; CHECK-NEXT: vcmpeqss %xmm5, %xmm4, %k0 +; CHECK-NEXT: vcmpeqss %xmm5, %xmm4, %k1 ; CHECK-NEXT: .LBB0_3: # %exit -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovss %xmm1, (%rsi) ; CHECK-NEXT: retq @@ -73,13 +71,13 @@ ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: je .LBB2_2 ; CHECK-NEXT: # BB#1: # %if -; CHECK-NEXT: movb (%rcx), %al -; CHECK-NEXT: addb (%rdx), %al +; CHECK-NEXT: kmovb (%rdx), %k0 +; CHECK-NEXT: kmovb (%rcx), %k1 +; CHECK-NEXT: kaddb %k1, %k0, %k1 ; CHECK-NEXT: jmp .LBB2_3 ; CHECK-NEXT: .LBB2_2: # %else -; CHECK-NEXT: movb (%rcx), %al +; CHECK-NEXT: kmovb (%rcx), %k1 ; CHECK-NEXT: .LBB2_3: # %exit -; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovss %xmm1, (%rsi) ; CHECK-NEXT: retq @@ -109,12 +107,11 @@ ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: je .LBB3_2 ; CHECK-NEXT: # BB#1: # %if -; CHECK-NEXT: movb (%rdx), %al +; CHECK-NEXT: kmovb (%rdx), %k1 ; CHECK-NEXT: jmp .LBB3_3 ; CHECK-NEXT: .LBB3_2: # %else -; CHECK-NEXT: movb (%rcx), %al +; CHECK-NEXT: kmovb (%rcx), %k1 ; CHECK-NEXT: .LBB3_3: # %exit -; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovss %xmm1, (%rsi) ; CHECK-NEXT: retq @@ -175,13 +172,12 @@ ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: je .LBB5_2 ; CHECK-NEXT: # BB#1: # %if -; CHECK-NEXT: movb (%rsi), %al -; CHECK-NEXT: addb %al, %al +; CHECK-NEXT: kmovb (%rsi), %k0 +; CHECK-NEXT: kaddb %k0, %k0, %k1 ; CHECK-NEXT: jmp .LBB5_3 ; CHECK-NEXT: .LBB5_2: # %else -; CHECK-NEXT: movb (%rdx), %al +; CHECK-NEXT: kmovb (%rdx), %k1 ; CHECK-NEXT: .LBB5_3: # %exit -; 
CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vmovaps %ymm1, (%rcx) ; CHECK-NEXT: vzeroupper @@ -253,13 +249,12 @@ ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: je .LBB7_2 ; CHECK-NEXT: # BB#1: # %if -; CHECK-NEXT: movb (%rsi), %al -; CHECK-NEXT: shrb $2, %al +; CHECK-NEXT: kmovb (%rsi), %k0 +; CHECK-NEXT: kshiftrb $2, %k0, %k1 ; CHECK-NEXT: jmp .LBB7_3 ; CHECK-NEXT: .LBB7_2: # %else -; CHECK-NEXT: movb (%rdx), %al +; CHECK-NEXT: kmovb (%rdx), %k1 ; CHECK-NEXT: .LBB7_3: # %exit -; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vmovaps %ymm1, (%rcx) ; CHECK-NEXT: vzeroupper @@ -292,13 +287,12 @@ ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: je .LBB8_2 ; CHECK-NEXT: # BB#1: # %if -; CHECK-NEXT: movb (%rsi), %al -; CHECK-NEXT: shlb $6, %al +; CHECK-NEXT: kmovb (%rsi), %k0 +; CHECK-NEXT: kshiftlb $6, %k0, %k1 ; CHECK-NEXT: jmp .LBB8_3 ; CHECK-NEXT: .LBB8_2: # %else -; CHECK-NEXT: movb (%rdx), %al +; CHECK-NEXT: kmovb (%rdx), %k1 ; CHECK-NEXT: .LBB8_3: # %exit -; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vmovaps %ymm1, (%rcx) ; CHECK-NEXT: vzeroupper @@ -328,17 +322,16 @@ ; CHECK: # BB#0: # %entry ; CHECK-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; CHECK-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; CHECK-NEXT: movb (%rsi), %al -; CHECK-NEXT: movb (%rdx), %dl +; CHECK-NEXT: kmovb (%rsi), %k0 +; CHECK-NEXT: kmovb (%rdx), %k1 ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: je .LBB9_2 ; CHECK-NEXT: # BB#1: # %if -; CHECK-NEXT: andb %dl, %al +; CHECK-NEXT: kandb %k1, %k0, %k1 ; CHECK-NEXT: jmp .LBB9_3 ; CHECK-NEXT: .LBB9_2: # %else -; CHECK-NEXT: addb %dl, %al +; CHECK-NEXT: kaddb %k1, %k0, %k1 ; CHECK-NEXT: .LBB9_3: # %exit -; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vmovaps %ymm1, (%rcx) ; CHECK-NEXT: vzeroupper