diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -45,6 +45,7 @@
   X86FixupBWInsts.cpp
   X86FixupLEAs.cpp
   X86FixupInstTuning.cpp
+  X86FixupVectorConstants.cpp
   X86AvoidStoreForwardingBlocks.cpp
   X86DynAllocaExpander.cpp
   X86FixupSetCC.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -60,10 +60,13 @@
 /// instructions, in order to eliminate execution delays in some processors.
 FunctionPass *createX86FixupLEAs();
-/// Return as pass that replaces equivilent slower instructions with faster
+/// Return a pass that replaces equivalent slower instructions with faster
 /// ones.
 FunctionPass *createX86FixupInstTuning();
+/// Return a pass that reduces the size of vector constant pool loads.
+FunctionPass *createX86FixupVectorConstants();
+
 /// Return a pass that removes redundant LEA instructions and redundant address
 /// recalculations.
 FunctionPass *createX86OptimizeLEAs();
@@ -171,6 +174,7 @@
 void initializeFixupLEAPassPass(PassRegistry &);
 void initializeX86ArgumentStackSlotPassPass(PassRegistry &);
 void initializeX86FixupInstTuningPassPass(PassRegistry &);
+void initializeX86FixupVectorConstantsPassPass(PassRegistry &);
 void initializeWinEHStatePassPass(PassRegistry &);
 void initializeX86AvoidSFBPassPass(PassRegistry &);
 void initializeX86AvoidTrailingCallPassPass(PassRegistry &);
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
new file
--- /dev/null
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -0,0 +1,304 @@
+//===-- X86FixupVectorConstants.cpp - optimize constant generation -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file examines all full size vector constant pool loads and attempts to
+// replace them with smaller constant pool entries, including:
+// * Converting AVX512 memory-fold instructions to their broadcast-fold form
+// * TODO: Broadcasting of full width loads.
+// * TODO: Sign/Zero extension of full width loads.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrFoldTables.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-fixup-vector-constants"
+
+STATISTIC(NumInstChanges, "Number of instruction changes");
+
+namespace {
+class X86FixupVectorConstantsPass : public MachineFunctionPass {
+public:
+  static char ID;
+
+  X86FixupVectorConstantsPass() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "X86 Fixup Vector Constants";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator &I);
+
+  // This pass runs after regalloc and doesn't support VReg operands.
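+  // Requiring NoVRegs below lets later verification assume every register
+  // operand has already been assigned a physical register.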
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+private:
+  const X86InstrInfo *TII = nullptr;
+  const X86Subtarget *ST = nullptr;
+  const MCSchedModel *SM = nullptr;
+};
+} // end anonymous namespace
+
+char X86FixupVectorConstantsPass::ID = 0;
+
+INITIALIZE_PASS(X86FixupVectorConstantsPass, DEBUG_TYPE, DEBUG_TYPE, false, false)
+
+FunctionPass *llvm::createX86FixupVectorConstants() {
+  return new X86FixupVectorConstantsPass();
+}
+
+static const Constant *getConstantFromPool(const MachineInstr &MI,
+                                           const MachineOperand &Op) {
+  if (!Op.isCPI() || Op.getOffset() != 0)
+    return nullptr;
+
+  ArrayRef<MachineConstantPoolEntry> Constants =
+      MI.getParent()->getParent()->getConstantPool()->getConstants();
+  const MachineConstantPoolEntry &ConstantEntry = Constants[Op.getIndex()];
+
+  // Bail if this is a machine constant pool entry, we won't be able to dig out
+  // anything useful.
+  if (ConstantEntry.isMachineConstantPoolEntry())
+    return nullptr;
+
+  return ConstantEntry.Val.ConstVal;
+}
+
+// Attempt to extract the full width of bits data from the constant.
+static std::optional<APInt> extractConstantBits(const Constant *C) {
+  unsigned NumBits = C->getType()->getPrimitiveSizeInBits();
+
+  if (auto *CInt = dyn_cast<ConstantInt>(C))
+    return CInt->getValue();
+
+  if (auto *CFP = dyn_cast<ConstantFP>(C))
+    return CFP->getValue().bitcastToAPInt();
+
+  if (auto *CV = dyn_cast<ConstantVector>(C)) {
+    if (auto *CVSplat = CV->getSplatValue(/*AllowUndefs*/ true)) {
+      if (std::optional<APInt> Bits = extractConstantBits(CVSplat)) {
+        assert((NumBits % Bits->getBitWidth()) == 0 && "Illegal splat");
+        return APInt::getSplat(NumBits, *Bits);
+      }
+    }
+  }
+
+  if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
+    bool IsInteger = CDS->getElementType()->isIntegerTy();
+    bool IsFloat = CDS->getElementType()->isHalfTy() ||
+                   CDS->getElementType()->isFloatTy() ||
+                   CDS->getElementType()->isDoubleTy();
+    if (IsInteger || IsFloat) {
+      APInt Bits = APInt::getZero(NumBits);
+      unsigned EltBits = CDS->getElementType()->getPrimitiveSizeInBits();
+      for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) {
+        if (IsInteger)
+          Bits.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
+        else
+          Bits.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
+                          I * EltBits);
+      }
+      return Bits;
+    }
+  }
+
+  return std::nullopt;
+}
+
+// Attempt to compute the splat width of bits data by normalizing the splat to
+// remove undefs.
+static std::optional<APInt> isSplatableConstant(const Constant *C,
+                                                unsigned SplatBitWidth) {
+  if (std::optional<APInt> Bits = extractConstantBits(C))
+    if (Bits->isSplat(SplatBitWidth))
+      return Bits->trunc(SplatBitWidth);
+
+  // Detect general splats with undefs.
+  // TODO: Do we need to handle NumEltsBits > SplatBitWidth splitting?
+  if (auto *CV = dyn_cast<ConstantVector>(C)) {
+    unsigned NumOps = CV->getNumOperands();
+    unsigned NumEltsBits = C->getType()->getScalarSizeInBits();
+    unsigned NumScaleOps = SplatBitWidth / NumEltsBits;
+    if ((SplatBitWidth % NumEltsBits) == 0) {
+      // Collect the elements and ensure that within the repeated splat
+      // sequence they either match or are undef.
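+      // For example (illustrative): a <4 x i32> constant <C, undef, C, undef>
+      // with SplatBitWidth == 64 gives NumScaleOps == 2 and the collected
+      // sequence {C, undef}; the undef slot is left as zero when the splat
+      // bits are assembled below.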
+      SmallVector<Constant *> Sequence(NumScaleOps, nullptr);
+      for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
+        if (Constant *Elt = CV->getAggregateElement(Idx)) {
+          if (isa<UndefValue>(Elt))
+            continue;
+          unsigned SplatIdx = Idx % NumScaleOps;
+          if (!Sequence[SplatIdx] || Sequence[SplatIdx] == Elt) {
+            Sequence[SplatIdx] = Elt;
+            continue;
+          }
+        }
+        return std::nullopt;
+      }
+      // Extract the constant bits forming the splat and insert into the bits
+      // data, leave undef as zero.
+      APInt SplatBits = APInt::getZero(SplatBitWidth);
+      for (unsigned I = 0; I != NumScaleOps; ++I) {
+        if (!Sequence[I])
+          continue;
+        if (std::optional<APInt> Bits = extractConstantBits(Sequence[I])) {
+          SplatBits.insertBits(*Bits, I * Bits->getBitWidth());
+          continue;
+        }
+        return std::nullopt;
+      }
+      return SplatBits;
+    }
+  }
+
+  return std::nullopt;
+}
+
+// Attempt to rebuild a normalized splat vector constant of the requested splat
+// width, built up of potentially smaller scalar values.
+// NOTE: We don't always bother converting to scalars if the vector length is 1.
+static Constant *rebuildSplatableConstant(const Constant *C,
+                                          unsigned SplatBitWidth) {
+  std::optional<APInt> Splat = isSplatableConstant(C, SplatBitWidth);
+  if (!Splat)
+    return nullptr;
+
+  // Determine scalar size to use for the constant splat vector, clamping as we
+  // might have found a splat smaller than the original constant data.
+  const Type *OriginalType = C->getType();
+  Type *SclTy = OriginalType->getScalarType();
+  unsigned NumSclBits = SclTy->getPrimitiveSizeInBits();
+  NumSclBits = std::min(NumSclBits, SplatBitWidth);
+
+  if (NumSclBits == 8) {
+    SmallVector<uint8_t> RawBits;
+    for (unsigned I = 0; I != SplatBitWidth; I += 8)
+      RawBits.push_back(Splat->extractBits(8, I).getZExtValue());
+    return ConstantDataVector::get(OriginalType->getContext(), RawBits);
+  }
+
+  if (NumSclBits == 16) {
+    SmallVector<uint16_t> RawBits;
+    for (unsigned I = 0; I != SplatBitWidth; I += 16)
+      RawBits.push_back(Splat->extractBits(16, I).getZExtValue());
+    if (SclTy->isHalfTy() || SclTy->isBFloatTy())
+      return ConstantDataVector::getFP(SclTy, RawBits);
+    return ConstantDataVector::get(OriginalType->getContext(), RawBits);
+  }
+
+  if (NumSclBits == 32) {
+    SmallVector<uint32_t> RawBits;
+    for (unsigned I = 0; I != SplatBitWidth; I += 32)
+      RawBits.push_back(Splat->extractBits(32, I).getZExtValue());
+    if (SclTy->isFloatTy())
+      return ConstantDataVector::getFP(SclTy, RawBits);
+    return ConstantDataVector::get(OriginalType->getContext(), RawBits);
+  }
+
+  // Fallback to i64 / double.
+  SmallVector<uint64_t> RawBits;
+  for (unsigned I = 0; I != SplatBitWidth; I += 64)
+    RawBits.push_back(Splat->extractBits(64, I).getZExtValue());
+  if (SclTy->isDoubleTy())
+    return ConstantDataVector::getFP(SclTy, RawBits);
+  return ConstantDataVector::get(OriginalType->getContext(), RawBits);
+}
+
+bool X86FixupVectorConstantsPass::processInstruction(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator &I) {
+  MachineInstr &MI = *I;
+  unsigned Opc = MI.getOpcode();
+  MachineConstantPool *CP = MI.getParent()->getParent()->getConstantPool();
+
+  // Currently only AVX512 instructions are handled.
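+  // Only EVEX-encoded (AVX512) instructions have an embedded-broadcast memory
+  // operand form (the "{1toN}" operands seen in the test changes), so earlier
+  // ISAs have no smaller broadcast-fold variant to switch to.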
+  if (!ST->hasAVX512())
+    return false;
+
+  auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128,
+                                unsigned OpBcst64, unsigned OpBcst32,
+                                unsigned OpBcst16, unsigned OpBcst8,
+                                unsigned OperandNo) {
+    assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
+           "Unexpected number of operands!");
+
+    MachineOperand &CstOp = MI.getOperand(OperandNo + X86::AddrDisp);
+    if (auto *C = getConstantFromPool(MI, CstOp)) {
+      // Attempt to detect a suitable splat from increasing splat widths.
+      std::pair<unsigned, unsigned> Broadcasts[] = {
+          {8, OpBcst8},   {16, OpBcst16},   {32, OpBcst32},
+          {64, OpBcst64}, {128, OpBcst128}, {256, OpBcst256},
+      };
+      for (auto [BitWidth, OpBcst] : Broadcasts) {
+        if (OpBcst) {
+          // Construct a suitable splat constant and adjust the MI to
+          // use the new constant pool entry.
+          if (Constant *NewCst = rebuildSplatableConstant(C, BitWidth)) {
+            unsigned NewCPI =
+                CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8));
+            MI.setDesc(TII->get(OpBcst));
+            CstOp.setIndex(NewCPI);
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  };
+
+  // Attempt to find a mapping from a full width memory-fold instruction to a
+  // broadcast-fold instruction variant.
+  for (unsigned BitWidth : {32, 64}) {
+    if (const X86MemoryFoldTableEntry *Mem2Bcst =
+            llvm::lookupBroadcastFoldTable(Opc, BitWidth)) {
+      unsigned OpNo = Mem2Bcst->Flags & TB_INDEX_MASK;
+      switch (BitWidth) {
+      case 32:
+        if (ConvertToBroadcast(0, 0, 0, Mem2Bcst->DstOp, 0, 0, OpNo))
+          return true;
+        break;
+      case 64:
+        if (ConvertToBroadcast(0, 0, Mem2Bcst->DstOp, 0, 0, 0, OpNo))
+          return true;
+        break;
+      }
+    }
+  }
+
+  return false;
+}
+
+bool X86FixupVectorConstantsPass::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "Start X86FixupVectorConstants\n";);
+  bool Changed = false;
+  ST = &MF.getSubtarget<X86Subtarget>();
+  TII = ST->getInstrInfo();
+  SM = &ST->getSchedModel();
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+      if (processInstruction(MF, MBB, I)) {
+        ++NumInstChanges;
+        Changed = true;
+      }
+    }
+  }
+  LLVM_DEBUG(dbgs() << "End X86FixupVectorConstants\n";);
+  return Changed;
+}
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.h b/llvm/lib/Target/X86/X86InstrFoldTables.h
--- a/llvm/lib/Target/X86/X86InstrFoldTables.h
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.h
@@ -47,6 +47,11 @@
 // Look up the memory unfolding table entry for this instruction.
 const X86MemoryFoldTableEntry *lookupUnfoldTable(unsigned MemOp);
 
+// Look up the broadcast memory folding table entry for this instruction from
+// the regular memory instruction.
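+// For example (illustrative; the memory opcode name is assumed):
+// lookupBroadcastFoldTable(X86::VANDPSZrm, 32) would return the entry whose
+// DstOp is X86::VANDPSZrmb, i.e. the {1to16} 32-bit broadcast form.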
+const X86MemoryFoldTableEntry *lookupBroadcastFoldTable(unsigned MemOp, + unsigned BroadcastBits); + } // namespace llvm #endif diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -30,6 +30,17 @@ { X86::VADDPSZ128rr, X86::VADDPSZ128rmb, TB_BCAST_SS }, { X86::VADDPSZ256rr, X86::VADDPSZ256rmb, TB_BCAST_SS }, { X86::VADDPSZrr, X86::VADDPSZrmb, TB_BCAST_SS }, + { X86::VANDNPDZ128rr, X86::VANDNPDZ128rmb, TB_BCAST_SD }, + { X86::VANDNPDZ256rr, X86::VANDNPDZ256rmb, TB_BCAST_SD }, + { X86::VANDNPDZrr, X86::VANDNPDZrmb, TB_BCAST_SD }, + { X86::VANDNPSZ128rr, X86::VANDNPSZ128rmb, TB_BCAST_SS }, + { X86::VANDNPSZ256rr, X86::VANDNPSZ256rmb, TB_BCAST_SS }, + { X86::VANDPDZ128rr, X86::VANDPDZ128rmb, TB_BCAST_SD }, + { X86::VANDPDZ256rr, X86::VANDPDZ256rmb, TB_BCAST_SD }, + { X86::VANDPDZrr, X86::VANDPDZrmb, TB_BCAST_SD }, + { X86::VANDPSZ128rr, X86::VANDPSZ128rmb, TB_BCAST_SS }, + { X86::VANDPSZ256rr, X86::VANDPSZ256rmb, TB_BCAST_SS }, + { X86::VANDPSZrr, X86::VANDPSZrmb, TB_BCAST_SS }, { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmbi, TB_BCAST_SD }, { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmbi, TB_BCAST_SD }, { X86::VCMPPDZrri, X86::VCMPPDZrmbi, TB_BCAST_SD }, @@ -72,6 +83,12 @@ { X86::VMULPSZ128rr, X86::VMULPSZ128rmb, TB_BCAST_SS }, { X86::VMULPSZ256rr, X86::VMULPSZ256rmb, TB_BCAST_SS }, { X86::VMULPSZrr, X86::VMULPSZrmb, TB_BCAST_SS }, + { X86::VORPDZ128rr, X86::VORPDZ128rmb, TB_BCAST_SD }, + { X86::VORPDZ256rr, X86::VORPDZ256rmb, TB_BCAST_SD }, + { X86::VORPDZrr, X86::VORPDZrmb, TB_BCAST_SD }, + { X86::VORPSZ128rr, X86::VORPSZ128rmb, TB_BCAST_SS }, + { X86::VORPSZ256rr, X86::VORPSZ256rmb, TB_BCAST_SS }, + { X86::VORPSZrr, X86::VORPSZrmb, TB_BCAST_SS }, { X86::VPADDDZ128rr, X86::VPADDDZ128rmb, TB_BCAST_D }, { X86::VPADDDZ256rr, X86::VPADDDZ256rmb, TB_BCAST_D }, { X86::VPADDDZrr, X86::VPADDDZrmb, TB_BCAST_D }, @@ -174,6 +191,12 @@ { X86::VSUBPSZ128rr, X86::VSUBPSZ128rmb, TB_BCAST_SS }, { X86::VSUBPSZ256rr, X86::VSUBPSZ256rmb, TB_BCAST_SS }, { X86::VSUBPSZrr, X86::VSUBPSZrmb, TB_BCAST_SS }, + { X86::VXORPDZ128rr, X86::VXORPDZ128rmb, TB_BCAST_SD }, + { X86::VXORPDZ256rr, X86::VXORPDZ256rmb, TB_BCAST_SD }, + { X86::VXORPDZrr, X86::VXORPDZrmb, TB_BCAST_SD }, + { X86::VXORPSZ128rr, X86::VXORPSZ128rmb, TB_BCAST_SS }, + { X86::VXORPSZ256rr, X86::VXORPSZ256rmb, TB_BCAST_SS }, + { X86::VXORPSZrr, X86::VXORPSZrmb, TB_BCAST_SS }, }; static const X86MemoryFoldTableEntry BroadcastFoldTable3[] = { @@ -293,6 +316,68 @@ { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmbi, TB_BCAST_Q }, }; +// Table to map instructions safe to broadcast using a different width from the +// element width. 
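+// For example, the VANDPSZrr entry below maps to VANDPDZrmb (TB_BCAST_SD):
+// bitwise ops ignore element width, so a constant that only splats at 64 bits
+// can still be folded as a 64-bit ({1to8}) broadcast load.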
+static const X86MemoryFoldTableEntry BroadcastSizeFoldTable2[] = { + { X86::VANDNPDZ128rr, X86::VANDNPSZ128rmb, TB_BCAST_SS }, + { X86::VANDNPDZ256rr, X86::VANDNPSZ256rmb, TB_BCAST_SS }, + { X86::VANDNPDZrr, X86::VANDNPSZrmb, TB_BCAST_SS }, + { X86::VANDNPSZ128rr, X86::VANDNPDZ128rmb, TB_BCAST_SD }, + { X86::VANDNPSZ256rr, X86::VANDNPDZ256rmb, TB_BCAST_SD }, + { X86::VANDNPSZrr, X86::VANDNPDZrmb, TB_BCAST_SD }, + { X86::VANDPDZ128rr, X86::VANDPSZ128rmb, TB_BCAST_SS }, + { X86::VANDPDZ256rr, X86::VANDPSZ256rmb, TB_BCAST_SS }, + { X86::VANDPDZrr, X86::VANDPSZrmb, TB_BCAST_SS }, + { X86::VANDPSZ128rr, X86::VANDPDZ128rmb, TB_BCAST_SD }, + { X86::VANDPSZ256rr, X86::VANDPDZ256rmb, TB_BCAST_SD }, + { X86::VANDPSZrr, X86::VANDPDZrmb, TB_BCAST_SD }, + { X86::VORPDZ128rr, X86::VORPSZ128rmb, TB_BCAST_SS }, + { X86::VORPDZ256rr, X86::VORPSZ256rmb, TB_BCAST_SS }, + { X86::VORPDZrr, X86::VORPSZrmb, TB_BCAST_SS }, + { X86::VORPSZ128rr, X86::VORPDZ128rmb, TB_BCAST_SD }, + { X86::VORPSZ256rr, X86::VORPDZ256rmb, TB_BCAST_SD }, + { X86::VORPSZrr, X86::VORPDZrmb, TB_BCAST_SD }, + { X86::VPANDDZ128rr, X86::VPANDQZ128rmb, TB_BCAST_Q }, + { X86::VPANDDZ256rr, X86::VPANDQZ256rmb, TB_BCAST_Q }, + { X86::VPANDDZrr, X86::VPANDQZrmb, TB_BCAST_Q }, + { X86::VPANDNDZ128rr, X86::VPANDNQZ128rmb, TB_BCAST_Q }, + { X86::VPANDNDZ256rr, X86::VPANDNQZ256rmb, TB_BCAST_Q }, + { X86::VPANDNDZrr, X86::VPANDNQZrmb, TB_BCAST_Q }, + { X86::VPANDNQZ128rr, X86::VPANDNDZ128rmb, TB_BCAST_D }, + { X86::VPANDNQZ256rr, X86::VPANDNDZ256rmb, TB_BCAST_D }, + { X86::VPANDNQZrr, X86::VPANDNDZrmb, TB_BCAST_D }, + { X86::VPANDQZ128rr, X86::VPANDDZ128rmb, TB_BCAST_D }, + { X86::VPANDQZ256rr, X86::VPANDDZ256rmb, TB_BCAST_D }, + { X86::VPANDQZrr, X86::VPANDDZrmb, TB_BCAST_D }, + { X86::VPORDZ128rr, X86::VPORQZ128rmb, TB_BCAST_Q }, + { X86::VPORDZ256rr, X86::VPORQZ256rmb, TB_BCAST_Q }, + { X86::VPORDZrr, X86::VPORQZrmb, TB_BCAST_Q }, + { X86::VPORQZ128rr, X86::VPORDZ128rmb, TB_BCAST_D }, + { X86::VPORQZ256rr, X86::VPORDZ256rmb, TB_BCAST_D }, + { X86::VPORQZrr, X86::VPORDZrmb, TB_BCAST_D }, + { X86::VPXORDZ128rr, X86::VPXORQZ128rmb, TB_BCAST_Q }, + { X86::VPXORDZ256rr, X86::VPXORQZ256rmb, TB_BCAST_Q }, + { X86::VPXORDZrr, X86::VPXORQZrmb, TB_BCAST_Q }, + { X86::VPXORQZ128rr, X86::VPXORDZ128rmb, TB_BCAST_D }, + { X86::VPXORQZ256rr, X86::VPXORDZ256rmb, TB_BCAST_D }, + { X86::VPXORQZrr, X86::VPXORDZrmb, TB_BCAST_D }, + { X86::VXORPDZ128rr, X86::VXORPSZ128rmb, TB_BCAST_SS }, + { X86::VXORPDZ256rr, X86::VXORPSZ256rmb, TB_BCAST_SS }, + { X86::VXORPDZrr, X86::VXORPSZrmb, TB_BCAST_SS }, + { X86::VXORPSZ128rr, X86::VXORPDZ128rmb, TB_BCAST_SD }, + { X86::VXORPSZ256rr, X86::VXORPDZ256rmb, TB_BCAST_SD }, + { X86::VXORPSZrr, X86::VXORPDZrmb, TB_BCAST_SD }, +}; + +static const X86MemoryFoldTableEntry BroadcastSizeFoldTable3[] = { + { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGQZ128rmbi, TB_BCAST_Q }, + { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGQZ256rmbi, TB_BCAST_Q }, + { X86::VPTERNLOGDZrri, X86::VPTERNLOGQZrmbi, TB_BCAST_Q }, + { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGDZ128rmbi, TB_BCAST_D }, + { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGDZ256rmbi, TB_BCAST_D }, + { X86::VPTERNLOGQZrri, X86::VPTERNLOGDZrmbi, TB_BCAST_D }, +}; + static const X86MemoryFoldTableEntry * lookupFoldTableImpl(ArrayRef Table, unsigned RegOp) { #ifndef NDEBUG @@ -339,6 +424,16 @@ std::end(BroadcastFoldTable3)) == std::end(BroadcastFoldTable3) && "BroadcastFoldTable3 is not sorted and unique!"); + assert(llvm::is_sorted(BroadcastSizeFoldTable2) && + 
std::adjacent_find(std::begin(BroadcastSizeFoldTable2),
+                            std::end(BroadcastSizeFoldTable2)) ==
+             std::end(BroadcastSizeFoldTable2) &&
+         "BroadcastSizeFoldTable2 is not sorted and unique!");
+  assert(llvm::is_sorted(BroadcastSizeFoldTable3) &&
+         std::adjacent_find(std::begin(BroadcastSizeFoldTable3),
+                            std::end(BroadcastSizeFoldTable3)) ==
+             std::end(BroadcastSizeFoldTable3) &&
+         "BroadcastSizeFoldTable3 is not sorted and unique!");
   FoldTablesChecked.store(true, std::memory_order_relaxed);
 }
 #endif
@@ -444,3 +539,85 @@
   return nullptr;
 }
 
+namespace {
+
+// This class stores the memory -> broadcast folding tables. It is instantiated
+// as a function scope static variable to lazily init the folding table.
+struct X86MemBroadcastFoldTable {
+  // Stores memory broadcast folding tables entries sorted by opcode.
+  std::vector<X86MemoryFoldTableEntry> Table;
+
+  X86MemBroadcastFoldTable() {
+    // Broadcast tables.
+    for (const X86MemoryFoldTableEntry &Reg2Bcst : BroadcastFoldTable2) {
+      unsigned RegOp = Reg2Bcst.KeyOp;
+      unsigned BcstOp = Reg2Bcst.DstOp;
+      if (const X86MemoryFoldTableEntry *Reg2Mem = lookupFoldTable(RegOp, 2)) {
+        unsigned MemOp = Reg2Mem->DstOp;
+        uint16_t Flags = Reg2Mem->Flags | Reg2Bcst.Flags | TB_INDEX_2 |
+                         TB_FOLDED_LOAD | TB_FOLDED_BCAST;
+        Table.push_back({MemOp, BcstOp, Flags});
+      }
+    }
+    for (const X86MemoryFoldTableEntry &Reg2Bcst : BroadcastSizeFoldTable2) {
+      unsigned RegOp = Reg2Bcst.KeyOp;
+      unsigned BcstOp = Reg2Bcst.DstOp;
+      if (const X86MemoryFoldTableEntry *Reg2Mem = lookupFoldTable(RegOp, 2)) {
+        unsigned MemOp = Reg2Mem->DstOp;
+        uint16_t Flags = Reg2Mem->Flags | Reg2Bcst.Flags | TB_INDEX_2 |
+                         TB_FOLDED_LOAD | TB_FOLDED_BCAST;
+        Table.push_back({MemOp, BcstOp, Flags});
+      }
+    }
+
+    for (const X86MemoryFoldTableEntry &Reg2Bcst : BroadcastFoldTable3) {
+      unsigned RegOp = Reg2Bcst.KeyOp;
+      unsigned BcstOp = Reg2Bcst.DstOp;
+      if (const X86MemoryFoldTableEntry *Reg2Mem = lookupFoldTable(RegOp, 3)) {
+        unsigned MemOp = Reg2Mem->DstOp;
+        uint16_t Flags = Reg2Mem->Flags | Reg2Bcst.Flags | TB_INDEX_3 |
+                         TB_FOLDED_LOAD | TB_FOLDED_BCAST;
+        Table.push_back({MemOp, BcstOp, Flags});
+      }
+    }
+    for (const X86MemoryFoldTableEntry &Reg2Bcst : BroadcastSizeFoldTable3) {
+      unsigned RegOp = Reg2Bcst.KeyOp;
+      unsigned BcstOp = Reg2Bcst.DstOp;
+      if (const X86MemoryFoldTableEntry *Reg2Mem = lookupFoldTable(RegOp, 3)) {
+        unsigned MemOp = Reg2Mem->DstOp;
+        uint16_t Flags = Reg2Mem->Flags | Reg2Bcst.Flags | TB_INDEX_3 |
+                         TB_FOLDED_LOAD | TB_FOLDED_BCAST;
+        Table.push_back({MemOp, BcstOp, Flags});
+      }
+    }
+
+    // Sort the memory->broadcast fold table.
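+    // Keeping entries ordered by KeyOp (the memory opcode) lets
+    // lookupBroadcastFoldTable below lower_bound on the opcode and then scan
+    // the few entries that share it for a matching broadcast width.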
+ array_pod_sort(Table.begin(), Table.end()); + } +}; +} // namespace + +static bool matchBroadcastSize(const X86MemoryFoldTableEntry &Entry, + unsigned BroadcastBits) { + switch (Entry.Flags & TB_BCAST_MASK) { + case TB_BCAST_SD: + case TB_BCAST_Q: + return BroadcastBits == 64; + case TB_BCAST_SS: + case TB_BCAST_D: + return BroadcastBits == 32; + } + return false; +} + +const X86MemoryFoldTableEntry * +llvm::lookupBroadcastFoldTable(unsigned MemOp, unsigned BroadcastBits) { + static X86MemBroadcastFoldTable MemBroadcastFoldTable; + auto &Table = MemBroadcastFoldTable.Table; + for (auto I = llvm::lower_bound(Table, MemOp); + I != Table.end() && I->KeyOp == MemOp; ++I) { + if (matchBroadcastSize(*I, BroadcastBits)) + return &*I; + } + return nullptr; +} diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -574,6 +574,7 @@ addPass(createX86PadShortFunctions()); addPass(createX86FixupLEAs()); addPass(createX86FixupInstTuning()); + addPass(createX86FixupVectorConstants()); } addPass(createX86EvexToVexInsts()); addPass(createX86DiscriminateMemOpsPass()); diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -286,7 +286,7 @@ ; SKX-NEXT: vpmovm2w %k0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _func8xi1 -; SKX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; SKX-NEXT: popq %rax ; SKX-NEXT: retq ; @@ -310,7 +310,7 @@ ; FASTISEL-NEXT: vpmovm2w %k0, %xmm0 ; FASTISEL-NEXT: vzeroupper ; FASTISEL-NEXT: callq _func8xi1 -; FASTISEL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FASTISEL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; FASTISEL-NEXT: popq %rax ; FASTISEL-NEXT: retq %cmpRes = icmp sgt <8 x i32>%a, %b diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -2895,7 +2895,7 @@ ; KNL-NEXT: vpcmpeqb %ymm2, %ymm3, %ymm2 ; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; KNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_64xi1_to_64xi8: @@ -2911,7 +2911,7 @@ ; AVX512DQNOBW-NEXT: vpcmpeqb %ymm2, %ymm3, %ymm2 ; AVX512DQNOBW-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQNOBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512DQNOBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512DQNOBW-NEXT: retq %mask = icmp eq <64 x i8> %x, %y %1 = zext <64 x i1> %mask to <64 x i8> @@ -2926,7 +2926,7 @@ ; KNL-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 ; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; KNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_32xi1_to_32xi16: @@ -2943,7 +2943,7 @@ ; AVX512DQNOBW-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 ; AVX512DQNOBW-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQNOBW-NEXT: vpandq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512DQNOBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512DQNOBW-NEXT: retq %mask = icmp eq <32 x i16> %x, %y %1 = zext <32 x i1> %mask to <32 x i16> @@ -2994,7 +2994,7 @@ ; AVX512DQNOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512DQNOBW-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQNOBW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQNOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQNOBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: retq %mask = icmp eq <32 x i16> %x, %y %1 = zext <32 x i1> %mask to <32 x i8> diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll --- a/llvm/test/CodeGen/X86/avx512-logic.ll +++ b/llvm/test/CodeGen/X86/avx512-logic.ll @@ -889,7 +889,7 @@ define <16 x i32> @ternlog_or_and_mask(<16 x i32> %x, <16 x i32> %y) { ; ALL-LABEL: ternlog_or_and_mask: ; ALL: ## %bb.0: -; ALL-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; ALL-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; ALL-NEXT: retq %a = and <16 x i32> %x, %b = or <16 x i32> %a, %y @@ -899,7 +899,7 @@ define <8 x i64> @ternlog_xor_and_mask(<8 x i64> %x, <8 x i64> %y) { ; ALL-LABEL: ternlog_xor_and_mask: ; ALL: ## %bb.0: -; ALL-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; ALL-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; ALL-NEXT: retq %a = and <8 x i64> %x, %b = xor <8 x i64> %a, %y @@ -909,7 +909,7 @@ define <16 x i32> @ternlog_maskz_or_and_mask(<16 x i32> %x, <16 x i32> %y, <16 x i32> %mask) { ; ALL-LABEL: ternlog_maskz_or_and_mask: ; ALL: ## %bb.0: -; ALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; ALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm3 ; ALL-NEXT: vpsrad $31, %zmm2, %zmm0 ; ALL-NEXT: vpternlogd $224, %zmm1, %zmm3, %zmm0 ; ALL-NEXT: retq @@ -923,7 +923,7 @@ define <8 x i64> @ternlog_maskz_xor_and_mask(<8 x i64> %x, <8 x i64> %y, <8 x i64> %mask) { ; ALL-LABEL: ternlog_maskz_xor_and_mask: ; ALL: ## %bb.0: -; ALL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; ALL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm3 ; ALL-NEXT: vpsraq $63, %zmm2, %zmm0 ; ALL-NEXT: vpternlogq $96, %zmm1, %zmm3, %zmm0 ; ALL-NEXT: retq @@ -939,14 +939,14 @@ ; KNL: ## %bb.0: ; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; KNL-NEXT: vpcmpgtd %zmm2, %zmm3, %k1 -; KNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2 ; KNL-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: ternlog_maskx_or_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovd2m %zmm2, %k1 -; SKX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; SKX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2 ; SKX-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} ; SKX-NEXT: retq %m = icmp slt <16 x i32> %mask, zeroinitializer @@ -961,7 +961,7 @@ ; KNL: ## %bb.0: ; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; KNL-NEXT: vpcmpgtd %zmm2, %zmm3, %k1 -; KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; KNL-NEXT: vpord %zmm1, %zmm0, %zmm1 {%k1} ; KNL-NEXT: 
vmovdqa64 %zmm1, %zmm0 ; KNL-NEXT: retq @@ -969,7 +969,7 @@ ; SKX-LABEL: ternlog_masky_or_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovd2m %zmm2, %k1 -; SKX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; SKX-NEXT: vorps %zmm1, %zmm0, %zmm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq @@ -985,14 +985,14 @@ ; KNL: ## %bb.0: ; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; KNL-NEXT: vpcmpgtq %zmm2, %zmm3, %k1 -; KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; KNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm2 ; KNL-NEXT: vpxorq %zmm1, %zmm2, %zmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: ternlog_maskx_xor_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovq2m %zmm2, %k1 -; SKX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; SKX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm2 ; SKX-NEXT: vxorpd %zmm1, %zmm2, %zmm0 {%k1} ; SKX-NEXT: retq %m = icmp slt <8 x i64> %mask, zeroinitializer @@ -1007,7 +1007,7 @@ ; KNL: ## %bb.0: ; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; KNL-NEXT: vpcmpgtq %zmm2, %zmm3, %k1 -; KNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; KNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; KNL-NEXT: vpxorq %zmm1, %zmm0, %zmm1 {%k1} ; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 ; KNL-NEXT: retq @@ -1015,7 +1015,7 @@ ; SKX-LABEL: ternlog_masky_xor_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovq2m %zmm2, %k1 -; SKX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm1 {%k1} ; SKX-NEXT: vmovapd %zmm1, %zmm0 ; SKX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll @@ -740,7 +740,7 @@ ; CHECK-LABEL: test_u1tofp2: ; CHECK: # %bb.0: ; CHECK-NEXT: vpmovqw %xmm0, %xmm0 -; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm0 ; CHECK-NEXT: retq %res = uitofp <2 x i1> %arg0 to <2 x half> diff --git a/llvm/test/CodeGen/X86/avx512vl-logic.ll b/llvm/test/CodeGen/X86/avx512vl-logic.ll --- a/llvm/test/CodeGen/X86/avx512vl-logic.ll +++ b/llvm/test/CodeGen/X86/avx512vl-logic.ll @@ -1039,7 +1039,7 @@ define <4 x i32> @ternlog_or_and_mask(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: ternlog_or_and_mask: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; CHECK-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; CHECK-NEXT: retq %a = and <4 x i32> %x, %b = or <4 x i32> %a, %y @@ -1049,7 +1049,7 @@ define <8 x i32> @ternlog_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y) { ; CHECK-LABEL: ternlog_or_and_mask_ymm: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; CHECK-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; CHECK-NEXT: retq %a = and <8 x i32> %x, %b = or <8 x i32> %a, %y @@ -1059,7 +1059,7 @@ define <2 x i64> @ternlog_xor_and_mask(<2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: ternlog_xor_and_mask: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; CHECK-NEXT: vpternlogq $108, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 ; CHECK-NEXT: retq %a = and <2 x i64> %x, %b = xor <2 x i64> %a, %y @@ -1069,7 +1069,7 @@ define <4 x i64> @ternlog_xor_and_mask_ymm(<4 x i64> %x, <4 x i64> %y) { ; CHECK-LABEL: ternlog_xor_and_mask_ymm: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; CHECK-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 ; CHECK-NEXT: retq %a = and <4 x i64> %x, %b = xor <4 x i64> %a, %y @@ -1079,7 +1079,7 @@ define <4 x i32> @ternlog_maskz_or_and_mask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %mask) { ; CHECK-LABEL: ternlog_maskz_or_and_mask: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 ; CHECK-NEXT: vpsrad $31, %xmm3, %xmm0 ; CHECK-NEXT: vpternlogd $224, %xmm1, %xmm2, %xmm0 ; CHECK-NEXT: retq @@ -1093,7 +1093,7 @@ define <8 x i32> @ternlog_maskz_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y, <8 x i32> %mask) { ; CHECK-LABEL: ternlog_maskz_or_and_mask_ymm: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm3 ; CHECK-NEXT: vpsrad $31, %ymm2, %ymm0 ; CHECK-NEXT: vpternlogd $224, %ymm1, %ymm3, %ymm0 ; CHECK-NEXT: retq @@ -1107,7 +1107,7 @@ define <2 x i64> @ternlog_maskz_xor_and_mask(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) { ; CHECK-LABEL: ternlog_maskz_xor_and_mask: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm3 ; CHECK-NEXT: vpsraq $63, %xmm2, %xmm0 ; CHECK-NEXT: vpternlogq $96, %xmm1, %xmm3, %xmm0 ; CHECK-NEXT: retq @@ -1121,7 +1121,7 @@ define <4 x i64> @ternlog_maskz_xor_and_mask_ymm(<4 x i64> %x, <4 x i64> %y, <4 x i64> %mask) { ; CHECK-LABEL: ternlog_maskz_xor_and_mask_ymm: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; CHECK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3 ; CHECK-NEXT: vpsraq $63, %ymm2, %ymm0 ; CHECK-NEXT: vpternlogq $96, %ymm1, %ymm3, %ymm0 ; CHECK-NEXT: retq @@ -1137,14 +1137,14 @@ ; KNL: ## %bb.0: ; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 -; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 ; KNL-NEXT: vpord %xmm1, %xmm2, %xmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: ternlog_maskx_or_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovd2m %xmm3, %k1 -; SKX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; SKX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 ; SKX-NEXT: vorps %xmm1, %xmm2, %xmm0 {%k1} ; SKX-NEXT: retq %m = icmp slt <4 x i32> %mask, zeroinitializer @@ -1159,14 +1159,14 @@ ; KNL: ## %bb.0: ; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; KNL-NEXT: vpcmpgtd %ymm2, %ymm3, %k1 -; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm2 ; KNL-NEXT: vpord %ymm1, %ymm2, %ymm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: ternlog_maskx_or_and_mask_ymm: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovd2m %ymm2, %k1 -; SKX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; SKX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm2 ; SKX-NEXT: vorps %ymm1, %ymm2, %ymm0 {%k1} ; SKX-NEXT: retq %m = icmp slt <8 x i32> %mask, zeroinitializer @@ -1181,14 +1181,14 @@ ; KNL: 
## %bb.0: ; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; KNL-NEXT: vpcmpgtq %xmm2, %xmm3, %k1 -; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; KNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm2 ; KNL-NEXT: vpxorq %xmm1, %xmm2, %xmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: ternlog_maskx_xor_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovq2m %xmm2, %k1 -; SKX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; SKX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm2 ; SKX-NEXT: vxorpd %xmm1, %xmm2, %xmm0 {%k1} ; SKX-NEXT: retq %m = icmp slt <2 x i64> %mask, zeroinitializer @@ -1203,14 +1203,14 @@ ; KNL: ## %bb.0: ; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; KNL-NEXT: vpcmpgtq %ymm2, %ymm3, %k1 -; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; KNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2 ; KNL-NEXT: vpxorq %ymm1, %ymm2, %ymm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: ternlog_maskx_xor_and_mask_ymm: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovq2m %ymm2, %k1 -; SKX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; SKX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2 ; SKX-NEXT: vxorpd %ymm1, %ymm2, %ymm0 {%k1} ; SKX-NEXT: retq %m = icmp slt <4 x i64> %mask, zeroinitializer @@ -1225,7 +1225,7 @@ ; KNL: ## %bb.0: ; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 -; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; KNL-NEXT: vpord %xmm1, %xmm0, %xmm1 {%k1} ; KNL-NEXT: vmovdqa %xmm1, %xmm0 ; KNL-NEXT: retq @@ -1233,7 +1233,7 @@ ; SKX-LABEL: ternlog_masky_or_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovd2m %xmm3, %k1 -; SKX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; SKX-NEXT: vorps %xmm1, %xmm0, %xmm1 {%k1} ; SKX-NEXT: vmovaps %xmm1, %xmm0 ; SKX-NEXT: retq @@ -1249,14 +1249,14 @@ ; KNL: ## %bb.0: ; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; KNL-NEXT: vpcmpgtd %ymm2, %ymm3, %k1 -; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm2 ; KNL-NEXT: vpord %ymm1, %ymm2, %ymm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: ternlog_masky_or_and_mask_ymm: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovd2m %ymm2, %k1 -; SKX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; SKX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm2 ; SKX-NEXT: vorps %ymm1, %ymm2, %ymm0 {%k1} ; SKX-NEXT: retq %m = icmp slt <8 x i32> %mask, zeroinitializer @@ -1271,7 +1271,7 @@ ; KNL: ## %bb.0: ; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; KNL-NEXT: vpcmpgtq %xmm2, %xmm3, %k1 -; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; KNL-NEXT: vpxorq %xmm1, %xmm0, %xmm1 {%k1} ; KNL-NEXT: vmovdqa %xmm1, %xmm0 ; KNL-NEXT: retq @@ -1279,7 +1279,7 @@ ; SKX-LABEL: ternlog_masky_xor_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovq2m %xmm2, %k1 -; SKX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; SKX-NEXT: vxorpd %xmm1, %xmm0, %xmm1 {%k1} ; SKX-NEXT: vmovapd %xmm1, %xmm0 ; SKX-NEXT: retq @@ -1295,7 +1295,7 @@ ; KNL: ## %bb.0: ; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; KNL-NEXT: vpcmpgtq %ymm2, %ymm3, %k1 -; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, 
%ymm0 ; KNL-NEXT: vpxorq %ymm1, %ymm0, %ymm1 {%k1} ; KNL-NEXT: vmovdqa %ymm1, %ymm0 ; KNL-NEXT: retq @@ -1303,7 +1303,7 @@ ; SKX-LABEL: ternlog_masky_xor_and_mask_ymm: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovq2m %ymm2, %k1 -; SKX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; SKX-NEXT: vxorpd %ymm1, %ymm0, %ymm1 {%k1} ; SKX-NEXT: vmovapd %ymm1, %ymm0 ; SKX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -1086,7 +1086,7 @@ ; ; AVX512-LABEL: trunc_v64i8_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll --- a/llvm/test/CodeGen/X86/combine-and.ll +++ b/llvm/test/CodeGen/X86/combine-and.ll @@ -546,12 +546,26 @@ ; SSE-NEXT: paddb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: PR34620: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: PR34620: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR34620: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: PR34620: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = lshr <16 x i8> %a0, %2 = and <16 x i8> %1, %3 = add <16 x i8> %2, %a1 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -249,11 +249,23 @@ ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; AVX1-NEXT: retq ; -; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pos1: -; AVX2ORLATER: # %bb.0: -; AVX2ORLATER-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2ORLATER-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2ORLATER-NEXT: retq +; AVX2-LABEL: combine_vec_sdiv_by_pos1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_vec_sdiv_by_pos1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: combine_vec_sdiv_by_pos1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: retq ; ; XOP-LABEL: combine_vec_sdiv_by_pos1: ; XOP: # %bb.0: diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll --- a/llvm/test/CodeGen/X86/dpbusd_const.ll +++ 
b/llvm/test/CodeGen/X86/dpbusd_const.ll @@ -89,7 +89,7 @@ ; AVX512VLVNNI-LABEL: mul_4xi4_cz: ; AVX512VLVNNI: # %bb.0: # %entry ; AVX512VLVNNI-NEXT: vpmovdb %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax diff --git a/llvm/test/CodeGen/X86/dpbusd_i4.ll b/llvm/test/CodeGen/X86/dpbusd_i4.ll --- a/llvm/test/CodeGen/X86/dpbusd_i4.ll +++ b/llvm/test/CodeGen/X86/dpbusd_i4.ll @@ -29,7 +29,7 @@ define i32 @mul_i4i8(<16 x i4> %a, <16 x i8> %b, i32 %c) { ; CHECK-LABEL: mul_i4i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] @@ -54,9 +54,9 @@ ; CHECK-NEXT: vpsllw $4, %xmm1, %xmm1 ; CHECK-NEXT: vpsrlw $4, %xmm1, %xmm1 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; CHECK-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 +; CHECK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 ; CHECK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll --- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll @@ -59,7 +59,7 @@ ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $7, %xmm1, %xmm1 ; GFNIAVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; GFNIAVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; GFNIAVX512-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; GFNIAVX512-NEXT: retq %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) ret <16 x i8> %res @@ -252,7 +252,7 @@ ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $7, %zmm1, %zmm1 ; GFNIAVX512-NEXT: vpaddb %zmm0, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; GFNIAVX512-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; GFNIAVX512-NEXT: retq %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> ) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll --- a/llvm/test/CodeGen/X86/gfni-rotates.ll +++ b/llvm/test/CodeGen/X86/gfni-rotates.ll @@ -61,7 +61,7 @@ ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $7, %xmm0, %xmm1 ; GFNIAVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; GFNIAVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; GFNIAVX512-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; GFNIAVX512-NEXT: retq %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) ret <16 x i8> %res @@ -256,7 +256,7 @@ ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $7, %zmm0, %zmm1 ; GFNIAVX512-NEXT: vpaddb %zmm0, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, 
%zmm0 +; GFNIAVX512-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; GFNIAVX512-NEXT: retq %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> ) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll --- a/llvm/test/CodeGen/X86/gfni-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-shifts.ll @@ -15,11 +15,17 @@ ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX-LABEL: splatconstant_shl_v16i8: -; GFNIAVX: # %bb.0: -; GFNIAVX-NEXT: vpsllw $3, %xmm0, %xmm0 -; GFNIAVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX-NEXT: retq +; GFNIAVX1OR2-LABEL: splatconstant_shl_v16i8: +; GFNIAVX1OR2: # %bb.0: +; GFNIAVX1OR2-NEXT: vpsllw $3, %xmm0, %xmm0 +; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1OR2-NEXT: retq +; +; GFNIAVX512-LABEL: splatconstant_shl_v16i8: +; GFNIAVX512: # %bb.0: +; GFNIAVX512-NEXT: vpsllw $3, %xmm0, %xmm0 +; GFNIAVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; GFNIAVX512-NEXT: retq %shift = shl <16 x i8> %a, ret <16 x i8> %shift } @@ -31,11 +37,17 @@ ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX-LABEL: splatconstant_lshr_v16i8: -; GFNIAVX: # %bb.0: -; GFNIAVX-NEXT: vpsrlw $7, %xmm0, %xmm0 -; GFNIAVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX-NEXT: retq +; GFNIAVX1OR2-LABEL: splatconstant_lshr_v16i8: +; GFNIAVX1OR2: # %bb.0: +; GFNIAVX1OR2-NEXT: vpsrlw $7, %xmm0, %xmm0 +; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1OR2-NEXT: retq +; +; GFNIAVX512-LABEL: splatconstant_lshr_v16i8: +; GFNIAVX512: # %bb.0: +; GFNIAVX512-NEXT: vpsrlw $7, %xmm0, %xmm0 +; GFNIAVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; GFNIAVX512-NEXT: retq %shift = lshr <16 x i8> %a, ret <16 x i8> %shift } @@ -63,7 +75,7 @@ ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 ; GFNIAVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; GFNIAVX512-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; GFNIAVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; GFNIAVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; GFNIAVX512-NEXT: retq %shift = ashr <16 x i8> %a, @@ -104,7 +116,7 @@ ; GFNIAVX512-LABEL: splatconstant_shl_v32i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $6, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq %shift = shl <32 x i8> %a, ret <32 x i8> %shift @@ -140,7 +152,7 @@ ; GFNIAVX512-LABEL: splatconstant_lshr_v32i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $1, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq %shift = lshr <32 x i8> %a, ret <32 x i8> %shift @@ -190,7 +202,7 @@ ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $2, %ymm0, %ymm0 ; GFNIAVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; GFNIAVX512-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; GFNIAVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; GFNIAVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; 
GFNIAVX512-NEXT: retq %shift = ashr <32 x i8> %a, @@ -244,7 +256,7 @@ ; GFNIAVX512-LABEL: splatconstant_shl_v64i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $5, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; GFNIAVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; GFNIAVX512-NEXT: retq %shift = shl <64 x i8> %a, ret <64 x i8> %shift @@ -293,7 +305,7 @@ ; GFNIAVX512-LABEL: splatconstant_lshr_v64i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $7, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; GFNIAVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; GFNIAVX512-NEXT: retq %shift = lshr <64 x i8> %a, ret <64 x i8> %shift @@ -366,9 +378,11 @@ ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $1, %zmm0, %zmm0 ; GFNIAVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; GFNIAVX512-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; GFNIAVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; GFNIAVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0 ; GFNIAVX512-NEXT: retq %shift = ashr <64 x i8> %a, ret <64 x i8> %shift } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFNIAVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -5,8 +5,8 @@ ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefixes=X86-AVX,X86-AVX2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64-SSE42 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1OR2,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1OR2,X64-AVX2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64-AVX,X64-AVX512 ; @@ -86,21 +86,13 @@ ; X64-SSE42-NEXT: movq %xmm2, %rax ; X64-SSE42-NEXT: retq ; -; X64-AVX1-LABEL: test_reduce_v2i64: -; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_reduce_v2i64: -; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax -; X64-AVX2-NEXT: retq +; X64-AVX1OR2-LABEL: test_reduce_v2i64: +; X64-AVX1OR2: ## %bb.0: +; X64-AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1OR2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X64-AVX1OR2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX1OR2-NEXT: vmovq %xmm0, %rax +; 
X64-AVX1OR2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v2i64: ; X64-AVX512: ## %bb.0: @@ -249,14 +241,23 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: retq +; X64-AVX1OR2-LABEL: test_reduce_v8i16: +; X64-AVX1OR2: ## %bb.0: +; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax +; X64-AVX1OR2-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1OR2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> %2 = icmp sgt <8 x i16> %a0, %1 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1 @@ -370,16 +371,27 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorb $127, %al -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: retq +; X64-AVX1OR2-LABEL: test_reduce_v16i8: +; X64-AVX1OR2: ## %bb.0: +; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax +; X64-AVX1OR2-NEXT: xorb $127, %al +; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1OR2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorb $127, %al +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> %2 = icmp sgt <16 x i8> %a0, %1 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1 @@ -807,7 +819,7 @@ ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF @@ -991,7 +1003,7 @@ ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxor 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 @@ -1578,7 +1590,7 @@ ; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF @@ -1799,7 +1811,7 @@ ; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 @@ -1889,15 +1901,25 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v16i16_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1OR2-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX1OR2: ## %bb.0: +; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax +; X64-AVX1OR2-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1OR2-NEXT: vzeroupper +; X64-AVX1OR2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> %2 = icmp sgt <16 x i16> %a0, %1 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 @@ -1966,15 +1988,25 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v32i16_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1OR2-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX1OR2: ## %bb.0: +; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax +; X64-AVX1OR2-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1OR2-NEXT: vzeroupper +; X64-AVX1OR2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX512: ## %bb.0: +; 
X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> %2 = icmp sgt <32 x i16> %a0, %1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1 @@ -2089,17 +2121,29 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v32i8_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorb $127, %al -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1OR2-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX1OR2: ## %bb.0: +; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax +; X64-AVX1OR2-NEXT: xorb $127, %al +; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1OR2-NEXT: vzeroupper +; X64-AVX1OR2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorb $127, %al +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> %2 = icmp sgt <32 x i8> %a0, %1 %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 @@ -2217,17 +2261,29 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v64i8_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorb $127, %al -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1OR2-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX1OR2: ## %bb.0: +; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax +; X64-AVX1OR2-NEXT: xorb $127, %al +; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1OR2-NEXT: vzeroupper +; X64-AVX1OR2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorb $127, %al +; X64-AVX512-NEXT: ## kill: def $al killed 
$al killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> %2 = icmp sgt <64 x i8> %a0, %1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -5,8 +5,8 @@ ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefixes=X86-AVX,X86-AVX2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64-SSE42 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1OR2,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1OR2,X64-AVX2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64-AVX,X64-AVX512 ; @@ -88,21 +88,13 @@ ; X64-SSE42-NEXT: movq %xmm2, %rax ; X64-SSE42-NEXT: retq ; -; X64-AVX1-LABEL: test_reduce_v2i64: -; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_reduce_v2i64: -; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax -; X64-AVX2-NEXT: retq +; X64-AVX1OR2-LABEL: test_reduce_v2i64: +; X64-AVX1OR2: ## %bb.0: +; X64-AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1OR2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X64-AVX1OR2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX1OR2-NEXT: vmovq %xmm0, %rax +; X64-AVX1OR2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v2i64: ; X64-AVX512: ## %bb.0: @@ -251,14 +243,23 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: retq +; X64-AVX1OR2-LABEL: test_reduce_v8i16: +; X64-AVX1OR2: ## %bb.0: +; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax +; X64-AVX1OR2-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1OR2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: retq %1 = shufflevector <8 x 
i16> %a0, <8 x i16> undef, <8 x i32> %2 = icmp slt <8 x i16> %a0, %1 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1 @@ -372,16 +373,27 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: addb $-128, %al -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: retq +; X64-AVX1OR2-LABEL: test_reduce_v16i8: +; X64-AVX1OR2: ## %bb.0: +; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax +; X64-AVX1OR2-NEXT: addb $-128, %al +; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1OR2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: addb $-128, %al +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> %2 = icmp slt <16 x i8> %a0, %1 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1 @@ -811,7 +823,7 @@ ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -995,7 +1007,7 @@ ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 @@ -1582,7 +1594,7 @@ ; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -1803,7 +1815,7 @@ ; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 @@ -1893,15 +1905,25 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v16i16_v8i16: 
-; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1OR2-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX1OR2: ## %bb.0: +; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax +; X64-AVX1OR2-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1OR2-NEXT: vzeroupper +; X64-AVX1OR2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> %2 = icmp slt <16 x i16> %a0, %1 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 @@ -1970,15 +1992,25 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v32i16_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1OR2-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX1OR2: ## %bb.0: +; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax +; X64-AVX1OR2-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1OR2-NEXT: vzeroupper +; X64-AVX1OR2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> %2 = icmp slt <32 x i16> %a0, %1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1 @@ -2093,17 +2125,29 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v32i8_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: addb $-128, %al -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1OR2-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX1OR2: ## %bb.0: +; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vphminposuw %xmm0, 
%xmm0 +; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax +; X64-AVX1OR2-NEXT: addb $-128, %al +; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1OR2-NEXT: vzeroupper +; X64-AVX1OR2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: addb $-128, %al +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> %2 = icmp slt <32 x i8> %a0, %1 %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 @@ -2221,17 +2265,29 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v64i8_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: addb $-128, %al -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1OR2-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX1OR2: ## %bb.0: +; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax +; X64-AVX1OR2-NEXT: addb $-128, %al +; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1OR2-NEXT: vzeroupper +; X64-AVX1OR2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: addb $-128, %al +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> %2 = icmp slt <64 x i8> %a0, %1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1 diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll --- a/llvm/test/CodeGen/X86/i64-to-float.ll +++ b/llvm/test/CodeGen/X86/i64-to-float.ll @@ -305,16 +305,16 @@ ; ; X86-AVX512F-LABEL: clamp_sitofp_2i64_2f64: ; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX512F-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0 +; X86-AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0 ; X86-AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX512F-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X86-AVX512F-NEXT: retl ; ; X86-AVX512DQ-LABEL: clamp_sitofp_2i64_2f64: ; X86-AVX512DQ: # %bb.0: -; X86-AVX512DQ-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX512DQ-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX512DQ-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0 +; X86-AVX512DQ-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0 ; X86-AVX512DQ-NEXT: vcvtqq2pd 
%xmm0, %xmm0 ; X86-AVX512DQ-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll --- a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll +++ b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll @@ -182,7 +182,7 @@ define <8 x i1> @andnot_ne_v8i16(<8 x i16> %x) nounwind { ; AVX512-LABEL: andnot_ne_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 @@ -249,7 +249,7 @@ define <16 x i1> @andnot_ne_v16i8(<16 x i8> %x) nounwind { ; AVX512-LABEL: andnot_ne_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -2554,7 +2554,7 @@ ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2587,7 +2587,7 @@ ; AVX512VLBW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm1 {%k1} ; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2787,7 +2787,7 @@ ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2820,7 +2820,7 @@ ; AVX512VLBW-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm1 {%k1} ; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -3029,7 +3029,7 @@ ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 ; 
AVX512VL-FALLBACK-NEXT: vpsubb %xmm0, %xmm2, %xmm0 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -3063,7 +3063,7 @@ ; AVX512VLBW-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0 ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsubb %xmm0, %xmm2, %xmm0 {%k1} ; AVX512VLBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 @@ -3269,7 +3269,7 @@ ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -3303,7 +3303,7 @@ ; AVX512VLBW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm1 {%k1} ; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -3518,7 +3518,7 @@ ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -3553,7 +3553,7 @@ ; AVX512VLBW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm1 {%k1} ; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -2026,7 +2026,7 @@ ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2058,7 +2058,7 @@ ; AVX512VLBW-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; 
AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1 {%k1} ; AVX512VLBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -2215,7 +2215,7 @@ ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2247,7 +2247,7 @@ ; AVX512VLBW-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1 {%k1} ; AVX512VLBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -2404,7 +2404,7 @@ ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm2, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2437,7 +2437,7 @@ ; AVX512VLBW-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 ; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsubb %ymm0, %ymm2, %ymm0 {%k1} ; AVX512VLBW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 @@ -2593,7 +2593,7 @@ ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2626,7 +2626,7 @@ ; AVX512VLBW-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1 {%k1} ; AVX512VLBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -2787,7 +2787,7 @@ ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2821,7 +2821,7 @@ ; AVX512VLBW-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512VLBW-NEXT: vpsubb 
%ymm2, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1 {%k1} ; AVX512VLBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -693,7 +693,7 @@ ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm2 @@ -722,7 +722,7 @@ ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm5, %ymm2 @@ -742,7 +742,7 @@ ; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 @@ -775,7 +775,7 @@ ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512F-NEXT: vpsubb %ymm2, %ymm4, %ymm2 @@ -804,7 +804,7 @@ ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm4, %ymm2 @@ -824,7 +824,7 @@ ; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 @@ -860,7 +860,7 @@ ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: 
vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512F-NEXT: vpsubb %ymm1, %ymm5, %ymm1 @@ -890,7 +890,7 @@ ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm5, %ymm1 @@ -911,7 +911,7 @@ ; AVX512BW-NEXT: vpmaxsb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsubb %zmm0, %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 @@ -946,7 +946,7 @@ ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm2 @@ -976,7 +976,7 @@ ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm5, %ymm2 @@ -997,7 +997,7 @@ ; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 @@ -1033,7 +1033,7 @@ ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512F-NEXT: vpsubb %ymm1, %ymm5, %ymm1 @@ -1064,7 +1064,7 @@ ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm5, %ymm1 @@ -1086,7 +1086,7 @@ ; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsubb %zmm2, 
%zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -1930,7 +1930,7 @@ ; CHECK-SKX-VBMI: # %bb.0: ; CHECK-SKX-VBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-SKX-VBMI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-SKX-VBMI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; CHECK-SKX-VBMI-NEXT: vpermb %ymm0, %ymm2, %ymm0 ; CHECK-SKX-VBMI-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; CHECK-SKX-VBMI-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 @@ -1941,7 +1941,7 @@ ; ; CHECK-AVX512-LABEL: var_rotate_v16i8: ; CHECK-AVX512: # %bb.0: -; CHECK-AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; CHECK-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; CHECK-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; CHECK-AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30] @@ -1955,7 +1955,7 @@ ; CHECK-VBMI: # %bb.0: ; CHECK-VBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-VBMI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-VBMI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; CHECK-VBMI-NEXT: vpermb %ymm0, %ymm2, %ymm0 ; CHECK-VBMI-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; CHECK-VBMI-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 @@ -1973,7 +1973,7 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" { ; CHECK-LABEL: var_rotate_v32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] @@ -2048,7 +2048,7 @@ ; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0 ; CHECK-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 -; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; CHECK-NEXT: retq %shl = shl <32 x i8> %a, %lshr = lshr <32 x i8> %a, diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -1315,7 +1315,7 @@ ; ; AVX512-LABEL: allzeros_v64i8_and1: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper @@ -1572,7 +1572,7 @@ ; ; AVX512-LABEL: allzeros_v32i16_and1: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper @@ -2461,7 +2461,7 @@ ; ; AVX512-LABEL: allzeros_v64i8_and4: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper @@ -2718,7 +2718,7 @@ ; ; AVX512-LABEL: allzeros_v32i16_and4: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -89,7 +89,7 @@ ; CHECK-NEXT: Local Dynamic TLS Access Clean-up ; CHECK-NEXT: X86 PIC Global Base Reg Initialization ; CHECK-NEXT: Argument Stack Rebase -; CHECK-NEXT: Finalize ISel and expand pseudo-instructions +; CHECK-NEXT: Finalize ISel and expand pseudo-instructions ; CHECK-NEXT: X86 Domain Reassignment Pass ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Early Tail Duplication @@ -204,6 +204,7 @@ ; CHECK-NEXT: X86 Atom pad short functions ; CHECK-NEXT: X86 LEA Fixup ; CHECK-NEXT: X86 Fixup Inst Tuning +; CHECK-NEXT: X86 Fixup Vector Constants ; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possible ; CHECK-NEXT: X86 Discriminate Memory Operands ; CHECK-NEXT: X86 Insert Cache Prefetches diff --git a/llvm/test/CodeGen/X86/paddus.ll b/llvm/test/CodeGen/X86/paddus.ll --- a/llvm/test/CodeGen/X86/paddus.ll +++ b/llvm/test/CodeGen/X86/paddus.ll @@ -131,7 +131,7 @@ ; ; AVX512-LABEL: test5: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpltub %xmm0, %xmm1, %k1 ; AVX512-NEXT: vpcmpeqd 
%xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} @@ -350,7 +350,7 @@ ; ; AVX512-LABEL: test11: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 ; AVX512-NEXT: vpcmpltub %ymm0, %ymm1, %k1 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} @@ -658,7 +658,7 @@ ; ; AVX512-LABEL: test17: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpltub %zmm0, %zmm1, %k1 ; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} @@ -852,7 +852,7 @@ ; ; AVX512-LABEL: test23: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpltuw %xmm0, %xmm1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} @@ -1103,7 +1103,7 @@ ; ; AVX512-LABEL: test29: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 ; AVX512-NEXT: vpcmpltuw %ymm0, %ymm1, %k1 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} @@ -1467,7 +1467,7 @@ ; ; AVX512-LABEL: test35: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpltuw %zmm0, %zmm1, %k1 ; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} diff --git a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll --- a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll @@ -41,7 +41,7 @@ ; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX256-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX256-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX256-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX256-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 ; AVX256-NEXT: vpand %xmm3, %xmm2, %xmm2 @@ -92,7 +92,7 @@ ; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX256-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX256-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX256-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 ; AVX256-NEXT: vpand %ymm3, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll b/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll --- a/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll @@ -11,7 +11,7 @@ ; AVX256-NEXT: vpsrlw $8, %ymm0, %ymm1 ; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX256-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX256-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX256-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -75,7 +75,7 @@ ; AVX256-NEXT: vptestmd 
%ymm2, %ymm2, %k1 ; AVX256-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX256-NEXT: vptestmd %ymm1, %ymm1, %k2 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX256-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vmovdqa %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/prefer-avx256-shift.ll b/llvm/test/CodeGen/X86/prefer-avx256-shift.ll --- a/llvm/test/CodeGen/X86/prefer-avx256-shift.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-shift.ll @@ -11,10 +11,10 @@ ; AVX256: # %bb.0: ; AVX256-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX256-NEXT: vpsllw $4, %ymm0, %ymm2 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX256-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX256-NEXT: vpsllw $2, %ymm0, %ymm2 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX256-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX256-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX256-NEXT: vpaddb %ymm0, %ymm0, %ymm2 @@ -34,10 +34,10 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 @@ -115,10 +115,10 @@ ; AVX256VL: # %bb.0: ; AVX256VL-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX256VL-NEXT: vpsllw $4, %xmm0, %xmm2 -; AVX256VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX256VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 ; AVX256VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX256VL-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX256VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX256VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 ; AVX256VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX256VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX256VL-NEXT: vpaddb %xmm0, %xmm0, %xmm2 @@ -153,14 +153,14 @@ ; AVX256: # %bb.0: ; AVX256-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX256-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX256-NEXT: vpsrlw $2, %ymm0, %ymm2 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX256-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX256-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX256-NEXT: vpsrlw $1, %ymm0, %ymm2 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX256-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX256-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX256-NEXT: retq @@ -177,14 +177,14 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512VL-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm2 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm2 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -259,14 +259,14 @@ ; AVX256VL: # %bb.0: ; AVX256VL-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX256VL-NEXT: vpsrlw $4, %xmm0, %xmm2 -; AVX256VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX256VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 ; AVX256VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX256VL-NEXT: vpsrlw $2, %xmm0, %xmm2 -; AVX256VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX256VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 ; AVX256VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX256VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX256VL-NEXT: vpsrlw $1, %xmm0, %xmm2 -; AVX256VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX256VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 ; AVX256VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX256VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX256VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/prefer-avx256-trunc.ll b/llvm/test/CodeGen/X86/prefer-avx256-trunc.ll --- a/llvm/test/CodeGen/X86/prefer-avx256-trunc.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-trunc.ll @@ -11,7 +11,7 @@ define <16 x i8> @testv16i16_trunc_v16i8(<16 x i16> %x) { ; AVX256NOBW-LABEL: testv16i16_trunc_v16i8: ; AVX256NOBW: # %bb.0: -; AVX256NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX256NOBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX256NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX256NOBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX256NOBW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll --- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+prefer-256-bit | FileCheck %s --check-prefix=AVX256BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,-prefer-256-bit | FileCheck %s --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,-prefer-256-bit | FileCheck %s --check-prefix=AVX512BWVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+prefer-256-bit | FileCheck %s --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,-prefer-256-bit | FileCheck %s --check-prefix=AVX512BW @@ -18,12 +18,26 @@ ; AVX256BW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 ; AVX256BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX256BW-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX256BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX256BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; 
AVX256BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX256BW-NEXT: vpsrlw $2, %ymm0, %ymm0 -; AVX256BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX256BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX256BW-NEXT: retq ; +; AVX512BWVL-LABEL: test_div7_32i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BWVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BWVL-NEXT: vpsrlw $8, %zmm1, %zmm1 +; AVX512BWVL-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512BWVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpsrlw $2, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512BWVL-NEXT: retq +; ; AVX512BW-LABEL: test_div7_32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero @@ -56,6 +70,14 @@ ; AVX256BW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX256BW-NEXT: retq ; +; AVX512BWVL-LABEL: test_mul_32i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq +; ; AVX512BW-LABEL: test_mul_32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -76,15 +76,35 @@ ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: ashr_xor_and_commute_uses: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdi) -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: ashr_xor_and_commute_uses: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ashr_xor_and_commute_uses: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ashr_xor_and_commute_uses: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %signsplat = ashr <16 x i8> %x, store <16 x i8> %signsplat, ptr %p1 %flipsign = xor <16 x i8> %x, diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll --- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll +++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll @@ -105,14 +105,14 @@ ; X86: # %bb.0: ; X86-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 ; X86-NEXT: vprold $7, %xmm0, %xmm0 -; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vrolw_extract_mul_with_mask: ; X64: # %bb.0: ; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-NEXT: vprold $7, %xmm0, %xmm0 -; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-NEXT: retq %lhs_mul = mul <4 x i32> %i, %rhs_mul = mul <4 x i32> %i, @@ -151,7 +151,7 @@ ; X86: # %bb.0: ; X86-NEXT: vpsllq $24, %ymm0, %ymm1 ; X86-NEXT: vpsrlq $39, %ymm0, %ymm0 -; X86-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm0 +; X86-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: no_extract_shl: diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll --- a/llvm/test/CodeGen/X86/rotate_vec.ll +++ b/llvm/test/CodeGen/X86/rotate_vec.ll @@ -45,7 +45,7 @@ ; AVX512-LABEL: rot_v4i32_splat_2masks: ; AVX512: # 
%bb.0: ; AVX512-NEXT: vprold $31, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, %2 = and <4 x i32> %1, @@ -123,7 +123,7 @@ ; AVX512-LABEL: rot_v4i32_mask_ashr0: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = ashr <4 x i32> %a0, %2 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %1, <4 x i32> %1, <4 x i32> ) @@ -151,7 +151,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrad $25, %xmm0, %xmm0 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = ashr <4 x i32> %a0, %2 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %1, <4 x i32> %1, <4 x i32> ) diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -567,7 +567,7 @@ ; AVX512BW-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq %z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -2489,7 +2489,7 @@ ; CHECK-AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 ; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] ; CHECK-AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; CHECK-AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; CHECK-AVX512VL-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0 ; CHECK-AVX512VL-NEXT: vpandn %ymm0, %ymm3, %ymm3 diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -567,7 +567,7 @@ ; AVX512BW-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq %z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) @@ -601,7 +601,7 @@ ; ; AVX512BW-LABEL: v16i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogq $96, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX512BW-NEXT: vpternlogd $96, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512BW-NEXT: retq %z = call <16 x i1> @llvm.ssub.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z diff --git 
a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll --- a/llvm/test/CodeGen/X86/usub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -527,7 +527,7 @@ ; ; AVX512BW-LABEL: v16i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpternlogq $96, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX512BW-NEXT: vpternlogd $96, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512BW-NEXT: retq %z = call <16 x i1> @llvm.usub.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128-fp16.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128-fp16.ll @@ -73,13 +73,13 @@ define <8 x half> @uitofp_v8i1_v8f16(<8 x i1> %x) #0 { ; X86-LABEL: uitofp_v8i1_v8f16: ; X86: # %bb.0: -; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 ; X86-NEXT: vcvtuw2ph %xmm0, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: uitofp_v8i1_v8f16: ; X64: # %bb.0: -; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-NEXT: vcvtuw2ph %xmm0, %xmm0 ; X64-NEXT: retq %result = call <8 x half> @llvm.experimental.constrained.uitofp.v8f16.v8i1(<8 x i1> %x, diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256-fp16.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256-fp16.ll @@ -32,14 +32,14 @@ define <16 x half> @uitofp_v16i1_v16f16(<16 x i1> %x) #0 { ; X86-LABEL: uitofp_v16i1_v16f16: ; X86: # %bb.0: -; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 ; X86-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X86-NEXT: vcvtuw2ph %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: uitofp_v16i1_v16f16: ; X64: # %bb.0: -; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X64-NEXT: vcvtuw2ph %ymm0, %ymm0 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll @@ -142,14 +142,14 @@ ; ; AVX512VL-32-LABEL: uitofp_v8i1_v8f32: ; AVX512VL-32: # %bb.0: -; AVX512VL-32-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VL-32-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX512VL-32-NEXT: retl ; ; AVX512VL-64-LABEL: uitofp_v8i1_v8f32: ; AVX512VL-64: # %bb.0: -; AVX512VL-64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; 
AVX512VL-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VL-64-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX512VL-64-NEXT: retq @@ -170,14 +170,14 @@ ; ; AVX512DQVL-32-LABEL: uitofp_v8i1_v8f32: ; AVX512DQVL-32: # %bb.0: -; AVX512DQVL-32-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; AVX512DQVL-32-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 ; AVX512DQVL-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQVL-32-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX512DQVL-32-NEXT: retl ; ; AVX512DQVL-64-LABEL: uitofp_v8i1_v8f32: ; AVX512DQVL-64: # %bb.0: -; AVX512DQVL-64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512DQVL-64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512DQVL-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQVL-64-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX512DQVL-64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512-fp16.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512-fp16.ll @@ -30,14 +30,14 @@ define <32 x half> @uitofp_v32i1_v32f16(<32 x i1> %x) #0 { ; X86-LABEL: uitofp_v32i1_v32f16: ; X86: # %bb.0: -; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 ; X86-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; X86-NEXT: vcvtuw2ph %zmm0, %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: uitofp_v32i1_v32f16: ; X64: # %bb.0: -; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; X64-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; X64-NEXT: vcvtuw2ph %zmm0, %zmm0 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -536,7 +536,7 @@ ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VL-NEXT: vpslld $16, %ymm0, %ymm0 ; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 +; AVX512VL-NEXT: vpandd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 @@ -805,7 +805,7 @@ ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 @@ -849,7 +849,7 @@ ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLBW-NEXT: vpsllw $8, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -863,7 +863,7 @@ ; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm3 -; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLVBMI2-NEXT: vpsllvw %ymm0, %ymm3, %ymm0 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -385,7 +385,7 @@ ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VL-NEXT: vpslld $16, %zmm0, %zmm0 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpsrld $16, %zmm0, %zmm0 @@ -614,10 +614,10 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm6 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6 @@ -628,10 +628,10 @@ ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 @@ -671,7 +671,7 @@ ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLBW-NEXT: vpsllw $8, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: 
vpsrlw $8, %zmm0, %zmm0 @@ -684,7 +684,7 @@ ; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95] ; AVX512VLVBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm3 -; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLVBMI2-NEXT: vpsllvw %zmm0, %zmm3, %zmm0 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -138,7 +138,7 @@ ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpslld $16, %zmm4, %zmm4 ; AVX512F-NEXT: vpord %zmm3, %zmm4, %zmm3 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512F-NEXT: vpsllvd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpsrld $16, %zmm3, %zmm3 @@ -163,7 +163,7 @@ ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VL-NEXT: vpslld $16, %zmm4, %zmm4 ; AVX512VL-NEXT: vpord %zmm3, %zmm4, %zmm3 -; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512VL-NEXT: vpsllvd %zmm4, %zmm3, %zmm3 ; AVX512VL-NEXT: vpsrld $16, %zmm3, %zmm3 @@ -227,7 +227,7 @@ ; AVX512F-NEXT: vpsrlw $4, %ymm5, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm7 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 
; AVX512F-NEXT: vpxor %ymm3, %ymm8, %ymm9 @@ -296,7 +296,7 @@ ; AVX512VL-NEXT: vpsrlw $4, %ymm5, %ymm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm7 -; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpxor %ymm3, %ymm8, %ymm9 @@ -359,7 +359,7 @@ ; AVX512BW-LABEL: var_funnnel_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63] ; AVX512BW-NEXT: vpsllvw %zmm5, %zmm3, %zmm3 @@ -374,7 +374,7 @@ ; AVX512VBMI2-LABEL: var_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm5 = 
zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63] ; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm3, %zmm3 @@ -389,7 +389,7 @@ ; AVX512VLBW-LABEL: var_funnnel_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63] ; AVX512VLBW-NEXT: vpsllvw %zmm5, %zmm3, %zmm3 @@ -404,7 +404,7 @@ ; AVX512VLVBMI2-LABEL: var_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm5 = 
zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63] ; AVX512VLVBMI2-NEXT: vpsllvw %zmm5, %zmm3, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -401,7 +401,7 @@ ; AVX512VL-LABEL: var_funnnel_v8i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] @@ -427,7 +427,7 @@ ; ; AVX512VLBW-LABEL: var_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 @@ -602,7 +602,7 @@ ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VL-NEXT: vpslld $8, %zmm0, %zmm2 ; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpsrld $8, %zmm0, %zmm0 @@ -625,7 +625,7 @@ ; ; AVX512VLBW-LABEL: var_funnnel_v16i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} 
ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30] @@ -654,7 +654,7 @@ ; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VLVBMI2-NEXT: vpermb %ymm0, %ymm2, %ymm0 -; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -310,7 +310,7 @@ ; AVX512VL-LABEL: var_funnnel_v16i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] @@ -334,7 +334,7 @@ ; ; AVX512VLBW-LABEL: var_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 @@ -485,7 +485,7 @@ ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -503,7 +503,7 @@ ; ; AVX512VLBW-LABEL: var_funnnel_v32i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLBW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30,32,32,34,34,36,36,38,38,40,40,42,42,44,44,46,46,48,48,50,50,52,52,54,54,56,56,58,58,60,60,62,62] @@ -529,7 +529,7 @@ ; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VLVBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0 -; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VLVBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -90,7 +90,7 @@ ; ; AVX512BW-LABEL: var_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 @@ -100,7 +100,7 @@ ; ; AVX512VLBW-LABEL: var_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 @@ -206,7 +206,7 @@ ; ; AVX512BW-LABEL: var_funnnel_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] @@ -221,7 +221,7 @@ ; ; AVX512VLBW-LABEL: var_funnnel_v64i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] @@ -236,7 +236,7 @@ ; ; AVX512VBMI2-LABEL: var_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] @@ -251,7 +251,7 @@ ; ; AVX512VLVBMI2-LABEL: var_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: 
vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -594,7 +594,7 @@ ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VL-NEXT: vpslld $16, %ymm0, %ymm0 ; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 @@ -925,7 +925,7 @@ ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLBW-NEXT: vpsllw $8, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 @@ -938,7 +938,7 @@ ; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm3 -; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLVBMI2-NEXT: vpsrlvw %ymm0, %ymm3, %ymm0 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll --- 
a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -414,7 +414,7 @@ ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VL-NEXT: vpslld $16, %zmm0, %zmm0 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 @@ -645,13 +645,13 @@ ; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm4 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4 ; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm4 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 ; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2 @@ -659,10 +659,10 @@ ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 @@ -700,7 +700,7 @@ ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLBW-NEXT: vpsllw $8, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 @@ -712,7 +712,7 @@ ; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95] ; AVX512VLVBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm3 -; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLVBMI2-NEXT: vpsrlvw %zmm0, %zmm3, %zmm0 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -142,7 +142,7 @@ ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpslld $16, %zmm4, %zmm4 ; AVX512F-NEXT: vpord %zmm3, %zmm4, %zmm3 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 @@ -165,7 +165,7 @@ ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VL-NEXT: vpslld $16, %zmm4, %zmm4 ; AVX512VL-NEXT: vpord %zmm3, %zmm4, %zmm3 -; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3 ; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3 @@ -227,7 +227,7 @@ ; AVX512F-NEXT: vpsllw $4, 
%ymm4, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm6 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpxor %ymm7, %ymm3, %ymm8 @@ -294,7 +294,7 @@ ; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm6 -; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpxor %ymm7, %ymm3, %ymm8 @@ -357,7 +357,7 @@ ; AVX512BW-LABEL: var_funnnel_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63] ; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3 @@ -373,7 +373,7 @@ ; AVX512VBMI2-LABEL: var_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vpandd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63] ; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3 @@ -387,7 +387,7 @@ ; AVX512VLBW-LABEL: var_funnnel_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63] ; AVX512VLBW-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3 @@ -403,7 +403,7 @@ ; AVX512VLVBMI2-LABEL: var_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm5 = 
zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63] ; AVX512VLVBMI2-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -419,7 +419,7 @@ ; AVX512VL-LABEL: var_funnnel_v8i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512VL-NEXT: vpsrlvd %ymm2, %ymm0, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] @@ -445,7 +445,7 @@ ; ; AVX512VLBW-LABEL: var_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 @@ -625,7 +625,7 @@ ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VL-NEXT: vpslld $8, %zmm0, %zmm2 ; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 @@ -646,7 +646,7 @@ ; ; AVX512VLBW-LABEL: var_funnnel_v16i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 
= xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30] @@ -673,7 +673,7 @@ ; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VLVBMI2-NEXT: vpermb %ymm0, %ymm2, %ymm0 -; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLVBMI2-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -326,7 +326,7 @@ ; AVX512VL-LABEL: var_funnnel_v16i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] @@ -350,7 +350,7 @@ ; ; AVX512VLBW-LABEL: var_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 @@ -527,7 +527,7 @@ ; ; AVX512VLBW-LABEL: var_funnnel_v32i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLBW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30,32,32,34,34,36,36,38,38,40,40,42,42,44,44,46,46,48,48,50,50,52,52,54,54,56,56,58,58,60,60,62,62] @@ -551,7 +551,7 @@ ; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VLVBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0 -; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VLVBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -90,7 +90,7 @@ ; ; AVX512BW-LABEL: var_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 @@ -100,7 +100,7 @@ ; ; AVX512VLBW-LABEL: var_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 @@ -204,7 +204,7 @@ ; ; AVX512BW-LABEL: var_funnnel_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] @@ -220,7 +220,7 @@ ; ; AVX512VLBW-LABEL: var_funnnel_v64i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] @@ -236,7 +236,7 @@ ; ; AVX512VBMI2-LABEL: var_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] @@ -250,7 +250,7 @@ ; ; AVX512VLVBMI2-LABEL: var_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -182,9 +182,9 @@ ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -269,7 +269,7 @@ ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq %res = sdiv <64 x i8> %a, @@ -507,13 +507,13 @@ ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 +; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsubb %zmm3, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -615,7 +615,7 @@ ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -162,7 +162,7 @@ ; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_div7_64i8: @@ -178,10 +178,10 @@ ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %res = udiv <64 x i8> %a, ret <64 x i8> %res @@ -524,12 +524,12 @@ ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll --- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll @@ -3,10 +3,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=NOBW,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=NOBW,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=NOBW,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=AVX512VLBWDQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=NOBW,AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck 
%s --check-prefixes=NOBW,AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=NOBW,AVX,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=AVX512VLBWDQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd,+avx512vl | FileCheck %s --check-prefixes=NOBW,AVX512,AVX512VLCD ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd | FileCheck %s --check-prefixes=NOBW,AVX512,AVX512CD ; @@ -158,40 +158,68 @@ ; SSE41-NEXT: paddq %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 -; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrlq $32, %xmm1, %xmm1 -; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1OR2-LABEL: testv2i64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1OR2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX1OR2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX1OR2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 +; AVX1OR2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1OR2-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: testv2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 +; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX512VL-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VL-NEXT: vpsrlw $8, %xmm1, 
%xmm1 +; AVX512VL-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 +; AVX512VL-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VL-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv2i64: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 @@ -408,40 +436,68 @@ ; SSE41-NEXT: paddq %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv2i64u: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 -; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrlq $32, %xmm1, %xmm1 -; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1OR2-LABEL: testv2i64u: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1OR2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX1OR2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX1OR2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 +; AVX1OR2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1OR2-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: testv2i64u: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; 
AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 +; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX512VL-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 +; AVX512VL-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VL-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv2i64u: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 @@ -650,35 +706,58 @@ ; SSE41-NEXT: paddd %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1OR2-LABEL: testv4i32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1OR2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX1OR2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX1OR2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1OR2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: testv4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = 
[4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 +; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX512VL-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv4i32: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 @@ -876,35 +955,58 @@ ; SSE41-NEXT: paddd %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv4i32u: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1OR2-LABEL: testv4i32u: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1OR2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX1OR2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX1OR2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1OR2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: testv4i32u: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX512VL-NEXT: 
vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 +; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX512VL-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv4i32u: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 @@ -1078,30 +1180,48 @@ ; SSE41-NEXT: paddw %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1OR2-LABEL: testv8i16: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1OR2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX1OR2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1OR2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: testv8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 +; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX512VL-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: retq ; ; AVX512VLBWDQ-LABEL: 
testv8i16: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 @@ -1268,30 +1388,48 @@ ; SSE41-NEXT: paddw %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv8i16u: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1OR2-LABEL: testv8i16u: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1OR2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX1OR2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1OR2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: testv8i16u: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 +; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX512VL-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv8i16u: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 @@ -1442,25 +1580,38 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, 
%xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 -; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1OR2-LABEL: testv16i8: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1OR2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1OR2-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 +; AVX1OR2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1OR2-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: testv16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv16i8: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VLBWDQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 ; AVX512VLBWDQ-NEXT: vpand %xmm3, %xmm2, %xmm2 @@ -1592,25 +1743,38 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv16i8u: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 -; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1OR2-LABEL: testv16i8u: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1OR2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1OR2-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 +; AVX1OR2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1OR2-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: testv16i8u: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv16i8u: ; 
AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VLBWDQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 ; AVX512VLBWDQ-NEXT: vpand %xmm3, %xmm2, %xmm2 @@ -1841,3 +2005,5 @@ declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll --- a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll @@ -96,7 +96,7 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -124,7 +124,7 @@ ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -278,7 +278,7 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -306,7 +306,7 @@ ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -445,7 +445,7 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -468,7 +468,7 @@ ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = 
[4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -597,7 +597,7 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -620,7 +620,7 @@ ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -734,7 +734,7 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -752,7 +752,7 @@ ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -851,7 +851,7 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -869,7 +869,7 @@ ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -953,7 +953,7 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = 
[4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -966,7 +966,7 @@ ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VLBWDQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 ; AVX512VLBWDQ-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -1045,7 +1045,7 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -1058,7 +1058,7 @@ ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VLBWDQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 ; AVX512VLBWDQ-NEXT: vpand %ymm3, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll --- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll @@ -360,7 +360,7 @@ ; AVX512BW-LABEL: testv32i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2 @@ -444,7 +444,7 @@ ; AVX512BW-LABEL: testv32i16u: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2 @@ -548,7 +548,7 @@ ; AVX512BW-LABEL: testv64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 @@ -636,7 +636,7 @@ ; AVX512BW-LABEL: testv64i8u: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -81,7 +81,7 @@ ; X64-AVX512DQ-LABEL: mul_v16i8_32: ; X64-AVX512DQ: # %bb.0: ; X64-AVX512DQ-NEXT: vpsllw $5, %xmm0, %xmm0 -; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-AVX512DQ-NEXT: retq %1 = mul <16 x i8> %a0, ret <16 x i8> %1 @@ -417,7 +417,7 @@ ; X64-AVX512DQ-LABEL: mul_v16i8_17: ; X64-AVX512DQ: # %bb.0: ; X64-AVX512DQ-NEXT: vpsllw $4, %xmm0, %xmm1 -; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; X64-AVX512DQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; X64-AVX512DQ-NEXT: retq %1 = mul <16 x i8> %a0, @@ -585,7 +585,7 @@ ; X64-AVX512DQ-LABEL: mul_v32i8_17: ; X64-AVX512DQ: # %bb.0: ; X64-AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm1 -; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; X64-AVX512DQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = mul <32 x i8> %a0, @@ -725,7 +725,7 @@ ; X64-AVX512DQ-LABEL: mul_v16i8_neg5: ; X64-AVX512DQ: # %bb.0: ; X64-AVX512DQ-NEXT: vpsllw $2, %xmm0, %xmm1 -; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; X64-AVX512DQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX512DQ-NEXT: vpsubb %xmm0, %xmm1, %xmm0 @@ -933,7 +933,7 @@ ; X64-AVX512DQ-LABEL: mul_v32i8_neg5: ; X64-AVX512DQ: # %bb.0: ; X64-AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm1 -; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; X64-AVX512DQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX512DQ-NEXT: vpsubb %ymm0, %ymm1, %ymm0 @@ -1261,7 +1261,7 @@ ; X64-AVX512DQ-LABEL: mul_v16i8_31: ; X64-AVX512DQ: # %bb.0: ; X64-AVX512DQ-NEXT: vpsllw $5, %xmm0, %xmm1 -; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; X64-AVX512DQ-NEXT: vpsubb %xmm0, %xmm1, %xmm0 ; X64-AVX512DQ-NEXT: retq %1 = mul <16 x i8> %a0, @@ -1385,7 +1385,7 @@ ; X64-AVX512DQ-LABEL: mul_v16i8_neg15: ; X64-AVX512DQ: # %bb.0: ; X64-AVX512DQ-NEXT: vpsllw $4, %xmm0, %xmm1 -; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; X64-AVX512DQ-NEXT: vpsubb 
%xmm1, %xmm0, %xmm0 ; X64-AVX512DQ-NEXT: retq %1 = mul <16 x i8> %a0, diff --git a/llvm/test/CodeGen/X86/vector-pack-128.ll b/llvm/test/CodeGen/X86/vector-pack-128.ll --- a/llvm/test/CodeGen/X86/vector-pack-128.ll +++ b/llvm/test/CodeGen/X86/vector-pack-128.ll @@ -95,12 +95,26 @@ ; SSE-NEXT: packsswb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: trunc_concat_packsswb_128: -; AVX: # %bb.0: -; AVX-NEXT: vpsraw $15, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: trunc_concat_packsswb_128: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_concat_packsswb_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_concat_packsswb_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = ashr <8 x i16> %a0, %2 = and <8 x i16> %a1, %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> @@ -116,12 +130,26 @@ ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: trunc_concat_packuswb_128: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: trunc_concat_packuswb_128: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_concat_packuswb_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_concat_packuswb_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = lshr <8 x i16> %a0, %2 = and <8 x i16> %a1, %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> @@ -231,12 +259,26 @@ ; SSE-NEXT: packsswb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: concat_trunc_packsswb_128: -; AVX: # %bb.0: -; AVX-NEXT: vpsraw $15, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: concat_trunc_packsswb_128: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_trunc_packsswb_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_trunc_packsswb_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = ashr <8 x i16> %a0, %2 = and <8 x i16> %a1, %3 = trunc <8 x i16> %1 to 
<8 x i8> @@ -253,12 +295,26 @@ ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: concat_trunc_packuswb_128: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: concat_trunc_packuswb_128: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_trunc_packuswb_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_trunc_packuswb_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = lshr <8 x i16> %a0, %2 = and <8 x i16> %a1, %3 = trunc <8 x i16> %1 to <8 x i8> @@ -266,3 +322,5 @@ %5 = shufflevector <8 x i8> %3, <8 x i8> %4, <16 x i32> ret <16 x i8> %5 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-pack-256.ll b/llvm/test/CodeGen/X86/vector-pack-256.ll --- a/llvm/test/CodeGen/X86/vector-pack-256.ll +++ b/llvm/test/CodeGen/X86/vector-pack-256.ll @@ -103,7 +103,7 @@ ; AVX512F-LABEL: trunc_concat_packsswb_256: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero @@ -116,7 +116,7 @@ ; AVX512BW-LABEL: trunc_concat_packsswb_256: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -152,7 +152,7 @@ ; AVX512F-LABEL: trunc_concat_packuswb_256: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero @@ -165,7 +165,7 @@ ; AVX512BW-LABEL: trunc_concat_packuswb_256: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $15, %ymm0, %ymm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = 
ymm0[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -254,7 +254,7 @@ ; AVX512-NEXT: vpsrld $17, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %ymm0, %xmm0 ; AVX512-NEXT: vpmovdw %ymm1, %xmm1 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -304,7 +304,7 @@ ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -315,7 +315,7 @@ ; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -365,7 +365,7 @@ ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -376,7 +376,7 @@ ; AVX512BW-NEXT: vpsrlw $15, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-pack-512.ll b/llvm/test/CodeGen/X86/vector-pack-512.ll --- a/llvm/test/CodeGen/X86/vector-pack-512.ll +++ b/llvm/test/CodeGen/X86/vector-pack-512.ll @@ -51,7 +51,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15] @@ -74,7 +74,7 @@ ; AVX512BW-LABEL: trunc_concat_packsswb_512: ; 
AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] @@ -97,7 +97,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15] @@ -120,7 +120,7 @@ ; AVX512BW-LABEL: trunc_concat_packuswb_512: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $15, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] @@ -195,7 +195,7 @@ ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] @@ -207,7 +207,7 @@ ; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] @@ -238,7 +238,7 @@ ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] @@ -250,7 +250,7 @@ ; AVX512BW-NEXT: vpsrlw $15, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[0,8,1,9,6,14,7,15] diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll --- a/llvm/test/CodeGen/X86/vector-pcmp.ll +++ b/llvm/test/CodeGen/X86/vector-pcmp.ll @@ -255,11 +255,23 @@ ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: cmpeq_zext_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: cmpeq_zext_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: cmpeq_zext_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: cmpeq_zext_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: retq %cmp = icmp eq <16 x i8> %a, %b %zext = zext <16 x i1> %cmp to <16 x i8> ret <16 x i8> %zext @@ -394,7 +406,7 @@ ; AVX512-LABEL: cmpgt_zext_v32i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512-NEXT: retq %cmp = icmp sgt <32 x i8> %a, %b %zext = zext <32 x i1> %cmp to <32 x i8> diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -28,13 +28,37 @@ ; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v2i64_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +; AVX1-LABEL: test_v2i64_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v2i64_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v2i64_v2i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: test_v2i64_v2i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BWVL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, %rax +; AVX512BWVL-NEXT: retq %1 = and <2 x i64> %a0, %2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %1) ret i64 %2 @@ -271,19 +295,33 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqb %zmm1, %xmm1 -; AVX512-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; 
AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v16i64_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: test_v16i64_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BWVL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, %rax +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq %1 = and <16 x i64> %a0, %2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %1) ret i64 %2 @@ -1015,7 +1053,7 @@ ; AVX512BWVL-LABEL: test_v16i16_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1241,7 +1279,7 @@ ; AVX512-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -436,7 +436,7 @@ ; ; AVX512-LABEL: trunc_v32i16_v32i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper @@ -485,7 +485,7 @@ ; ; AVX512-LABEL: trunc_v64i8_v64i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -987,7 +987,7 @@ ; AVX512-LABEL: mask_v128i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; 
AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll @@ -1208,14 +1208,23 @@ ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512VL-NEXT: retq %1 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -1269,17 +1278,29 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v16i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v16i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1341,19 +1362,33 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_v32i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512-NEXT: # kill: def $ax killed $ax killed 
$eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v32i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v32i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1431,20 +1466,35 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_v64i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v64i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v64i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1675,16 +1725,27 @@ ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorb $127, 
%al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: xorb $127, %al +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: xorb $127, %al +; AVX512VL-NEXT: # kill: def $al killed $al killed $eax +; AVX512VL-NEXT: retq %1 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -1768,19 +1829,33 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_v32i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorb $127, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: xorb $127, %al +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v32i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: xorb $127, %al +; AVX512VL-NEXT: # kill: def $al killed $al killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -1880,21 +1955,37 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_v64i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorb $127, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v64i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxsb %xmm1, 
%xmm0, %xmm0 +; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: xorb $127, %al +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v64i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: xorb $127, %al +; AVX512VL-NEXT: # kill: def $al killed $al killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2026,22 +2117,39 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_v128i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorb $127, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v128i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: xorb $127, %al +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v128i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: xorb $127, %al +; AVX512VL-NEXT: # kill: def $al killed $al killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %a0) ret i8 %1 } diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll @@ -1208,14 +1208,23 @@ ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v8i16: -; AVX512: # 
%bb.0: -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512VL-NEXT: retq %1 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -1269,17 +1278,29 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v16i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v16i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1341,19 +1362,33 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_v32i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v32i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: xorl $32768, %eax # imm = 0x8000 
+; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v32i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1431,20 +1466,35 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_v64i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v64i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v64i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: xorl $32768, %eax # imm = 0x8000 +; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1675,16 +1725,27 @@ ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: addb $-128, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: addb $-128, %al +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: retq +; +; 
AVX512VL-LABEL: test_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: addb $-128, %al +; AVX512VL-NEXT: # kill: def $al killed $al killed $eax +; AVX512VL-NEXT: retq %1 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -1768,19 +1829,33 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_v32i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: addb $-128, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: addb $-128, %al +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v32i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: addb $-128, %al +; AVX512VL-NEXT: # kill: def $al killed $al killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -1880,21 +1955,37 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_v64i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: addb $-128, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v64i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: addb $-128, %al +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v64i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, 
%zmm0, %ymm1 +; AVX512VL-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: addb $-128, %al +; AVX512VL-NEXT: # kill: def $al killed $al killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2026,22 +2117,39 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_v128i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: addb $-128, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v128i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: addb $-128, %al +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v128i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: addb $-128, %al +; AVX512VL-NEXT: # kill: def $al killed $al killed $eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %a0) ret i8 %1 } diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -331,7 +331,7 @@ ; AVX512VL-LABEL: var_rotate_v8i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = 
[16,16,16,16,16,16,16,16] @@ -357,7 +357,7 @@ ; ; AVX512VLBW-LABEL: var_rotate_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 @@ -535,7 +535,7 @@ ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VL-NEXT: vpslld $8, %zmm0, %zmm2 ; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpsrld $8, %zmm0, %zmm0 @@ -558,7 +558,7 @@ ; ; AVX512VLBW-LABEL: var_rotate_v16i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30] @@ -587,7 +587,7 @@ ; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VLVBMI2-NEXT: vpermb %ymm0, %ymm2, %ymm0 -; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -1708,7 +1708,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $5, %xmm0, %xmm1 ; AVX512VL-NEXT: vpsrlw $11, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX512VL-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512VL-NEXT: 
retq ; ; AVX512BW-LABEL: splatconstant_rotate_mask_v8i16: @@ -1723,7 +1723,7 @@ ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $5, %xmm0, %xmm1 ; AVX512VLBW-NEXT: vpsrlw $11, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX512VLBW-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v8i16: @@ -1737,7 +1737,7 @@ ; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v8i16: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpshldw $5, %xmm0, %xmm0, %xmm0 -; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_mask_v8i16: @@ -1798,7 +1798,7 @@ ; AVX512VLX-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VLX-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512VLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 -; AVX512VLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VLX-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_mask_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -251,7 +251,7 @@ ; AVX512VL-LABEL: var_rotate_v16i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] @@ -275,7 +275,7 @@ ; ; AVX512VLBW-LABEL: var_rotate_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 @@ -429,7 +429,7 @@ ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -447,7 +447,7 @@ ; ; AVX512VLBW-LABEL: var_rotate_v32i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLBW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30,32,32,34,34,36,36,38,38,40,40,42,42,44,44,46,46,48,48,50,50,52,52,54,54,56,56,58,58,60,60,62,62] @@ -473,7 +473,7 @@ ; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VLVBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0 -; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VLVBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 @@ -1554,7 +1554,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX512VL-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_mask_v16i16: @@ -1569,7 +1569,7 @@ ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $5, %ymm0, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $11, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX512VLBW-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v16i16: @@ -1582,7 +1582,7 @@ ; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v16i16: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpshldw $5, %ymm0, %ymm0, %ymm0 -; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16: @@ -1652,7 +1652,7 @@ ; AVX512VLX-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VLX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VLX-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 -; AVX512VLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VLX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VLX-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll --- a/llvm/test/CodeGen/X86/vector-rotate-512.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -91,7 +91,7 @@ ; ; AVX512BW-LABEL: var_rotate_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 @@ -101,7 +101,7 @@ ; ; AVX512VLBW-LABEL: var_rotate_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 @@ -210,7 +210,7 @@ ; ; AVX512BW-LABEL: var_rotate_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] @@ -225,7 +225,7 @@ ; ; AVX512VLBW-LABEL: var_rotate_v64i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm4 = 
zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] @@ -240,7 +240,7 @@ ; ; AVX512VBMI2-LABEL: var_rotate_v64i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] @@ -255,7 +255,7 @@ ; ; AVX512VLVBMI2-LABEL: var_rotate_v64i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] @@ -840,7 +840,7 @@ ; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $11, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16: @@ -852,33 +852,33 @@ ; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $11, %ymm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512VL-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, 
%zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512VLBW-NEXT: vpternlogd $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v32i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v32i16: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: retq %shl = shl <32 x i16> %a, %lshr = lshr <32 x i16> %a, @@ -899,7 +899,7 @@ ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8: @@ -912,7 +912,7 @@ ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 -; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8: @@ -920,7 +920,7 @@ ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8: @@ -928,7 +928,7 @@ ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 -; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v64i8: @@ -936,7 +936,7 @@ ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 -; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v64i8: @@ -944,7 +944,7 @@ ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 -; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; 
AVX512VLVBMI2-NEXT: retq %shl = shl <64 x i8> %a, %lshr = lshr <64 x i8> %a, diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -1177,7 +1177,7 @@ ; ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 @@ -1187,7 +1187,7 @@ ; ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 @@ -1735,7 +1735,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX512VL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -1323,7 +1323,7 @@ ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 -; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpsraw %xmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 @@ -1995,7 +1995,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX512VL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -465,7 +465,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0 ; 
AVX512BW-NEXT: retq %shift = ashr <64 x i8> %a, diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -2335,7 +2335,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX512VL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; @@ -2388,7 +2388,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX512VL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; @@ -2441,7 +2441,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX512VL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -980,7 +980,7 @@ ; ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpsrld %xmm1, %zmm0, %zmm0 @@ -990,7 +990,7 @@ ; ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 @@ -1463,7 +1463,7 @@ ; AVX512VL-LABEL: splatconstant_shift_v16i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: 
splatconstant_shift_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -446,14 +446,14 @@ ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512DQVL-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpsrlw $2, %ymm0, %ymm2 -; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpsrlw $1, %ymm0, %ymm2 -; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512DQVL-NEXT: retq @@ -1093,7 +1093,7 @@ ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 @@ -1680,7 +1680,7 @@ ; AVX512VL-LABEL: splatconstant_shift_v32i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; X86-AVX1-LABEL: splatconstant_shift_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll --- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -86,17 +86,17 @@ ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpandd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} @@ -393,13 +393,13 @@ ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = lshr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll --- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -2034,7 +2034,7 @@ ; AVX512VL-LABEL: splatconstant_shift_v8i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v8i8: @@ -2073,7 +2073,7 @@ ; AVX512VL-LABEL: splatconstant_shift_v4i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v4i8: @@ -2112,7 +2112,7 @@ ; AVX512VL-LABEL: splatconstant_shift_v2i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v2i8: diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -884,7 +884,7 @@ ; ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpslld %xmm1, %zmm0, %zmm0 @@ -894,7 +894,7 @@ ; ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 @@ -1327,7 +1327,7 @@ ; AVX512VL-LABEL: splatconstant_shift_v16i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -381,10 +381,10 @@ ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm2 -; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpsllw $2, %ymm0, %ymm2 -; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 ; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 @@ -1013,7 +1013,7 @@ ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 @@ -1568,7 +1568,7 @@ ; AVX512VL-LABEL: splatconstant_shift_v32i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; X86-AVX1-LABEL: splatconstant_shift_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll @@ -83,12 +83,12 @@ ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2 -; 
AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} @@ -380,13 +380,13 @@ ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = shl <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -1825,7 +1825,7 @@ ; AVX512VL-LABEL: splatconstant_shift_v8i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v8i8: @@ -1864,7 +1864,7 @@ ; AVX512VL-LABEL: splatconstant_shift_v4i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v4i8: @@ -1903,7 +1903,7 @@ ; AVX512VL-LABEL: splatconstant_shift_v2i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v2i8: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -160,7 +160,7 @@ define <16 x float> @shuffle_v16f32_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x float> %a) { ; ALL-LABEL: shuffle_v16f32_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; ALL: # %bb.0: -; ALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; ALL-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; ALL-NEXT: retq %tmp1 = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> %tmp2 = shufflevector <16 x float> %tmp1, <16 x float> , <16 x i32> diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll --- a/llvm/test/CodeGen/X86/vselect-pcmp.ll +++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll @@ -612,7 +612,7 @@ ; ; AVX512VL-LABEL: blend_splat1_mask_cond_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0 @@ -651,7 +651,7 @@ ; ; AVX512VL-LABEL: blend_splat1_mask_cond_v16i8: ; AVX512VL: # %bb.0: -; 
AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpternlogq $202, %xmm2, %xmm1, %xmm0 @@ -756,7 +756,7 @@ ; ; AVX512VL-LABEL: blend_splatmax_mask_cond_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpternlogq $202, %xmm2, %xmm1, %xmm0 @@ -803,7 +803,7 @@ ; ; AVX512VL-LABEL: blend_splatmax_mask_cond_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0 @@ -941,7 +941,7 @@ ; ; AVX512VL-LABEL: blend_splat_mask_cond_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0 @@ -980,7 +980,7 @@ ; ; AVX512VL-LABEL: blend_splat_mask_cond_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpternlogq $202, %xmm2, %xmm1, %xmm0